From 8c8c667405cc046c15e180682fbd47de3195b8af Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Mon, 29 Apr 2024 09:48:34 -0700 Subject: [PATCH 001/178] docs and simplification of cmd args (#8979) * docs and simplification of cmd args Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added cicd test Signed-off-by: arendu * added cicd test is needs Signed-off-by: arendu * Update information_retrieval.rst Signed-off-by: Adi Renduchintala * updated to fix wrong file paths Signed-off-by: arendu * update Signed-off-by: arendu * Update cicd-main.yml Signed-off-by: Adi Renduchintala --------- Signed-off-by: arendu Signed-off-by: Adi Renduchintala Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay --- .github/workflows/cicd-main.yml | 55 +++++++++ docs/source/nlp/information_retrieval.rst | 104 ++++++++++++++++++ ...megatron_gpt_embedder_generate_config.yaml | 14 ++- .../megatron_gpt_embedder_tuning_config.yaml | 37 ++++--- .../megatron_gpt_embedding_model.py | 9 +- .../megatron_gpt_sft_model.py | 3 +- 6 files changed, 200 insertions(+), 22 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e6e8fb808943..a13284521b3c 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4648,6 +4648,60 @@ jobs: rm -rf examples/nlp/language_modeling/gpt_sft_results - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + + L2_Megatron_GPT_Embedding: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - run: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir + + python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ + exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.test_ds.write_embeddings_to_file=True \ + 
model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ + model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] + + rm -rf /home/TestData/nlp/megatron_ir/working_dir + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_PEFT_Lora_PP2: needs: [cicd-test-container-setup] @@ -6256,6 +6310,7 @@ jobs: - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 - L2_Megatron_GPT_Finetuning_PP2 - L2_Megatron_GPT_Finetuning_StarCoder_PP1 + - L2_Megatron_GPT_Embedding - L2_Megatron_GPT_PEFT_Lora_PP2 - L2_Megatron_GPT_PEFT_Lora_TP2 - L2_Megatron_GPT_Eval diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst index a283c845b11d..fa9157e45b59 100644 --- a/docs/source/nlp/information_retrieval.rst +++ b/docs/source/nlp/information_retrieval.rst @@ -102,3 +102,107 @@ Then you can fine-tune the sentence-BERT model using the following script: exp_manager.wandb_logger_kwargs.name=${NAME} \ exp_manager.wandb_logger_kwargs.project=${PROJECT} +GPT Embedding Models +===================== + +Recent work has also shown that it is possible to use Decoder-Only (GPT-style) models to train embedding models. +`Improving Text Embeddings with +Large Language Models `__ is one such recent paper, which served as the inspiration for implementing decoder-only embedding training in NeMo. + +Training a GPT Embedding Model +------------------------------- + +To train GPT Embedding models, we follow a format very similar to SBERT Embedding training. However, there are a couple of differences. GPT Embedding model training expects a `jsonl` file in which each line is a json object. Here is a truncated example of a data jsonl file:: + +{"query": "What did ... 1952-2002 period?", "pos_doc": "Morning (2008) ... has changed little.", "neg_doc": "Even though ... sapiens.", "query_id": "q103151", "doc_id": "d14755"} +{"query": "What type of ... passions?", "pos_doc": "Burke was a leading ... upper classes.", "neg_doc": "Writing to a friend ... Government.", "query_id": "q77959", "doc_id": "d11263"} +{"query": "Since 1999, ... progressed at?", "pos_doc": "Commercial solar water ... as of 2007.", "neg_doc": "The potential solar ... acquire.", "query_id": "q16545", "doc_id": "d1883"} + + +As shown above, each json object should contain the following fields: ``query``, ``pos_doc``, ``neg_doc``, ``query_id`` and ``doc_id``. The ``query_id`` and ``doc_id`` can be any alphanumeric strings that uniquely map to the ``query`` string and ``pos_doc`` string. + +During training, the GPT Embedding model employs LoRA (by default) to learn embeddings for the queries and documents, such that the ``query``-to-``pos_doc`` similarity is maximized while the ``query``-to-``neg_doc`` similarity is simultaneously minimized. LoRA allows us to fine-tune large LLMs such as the Mistral 7B model with a relatively small number of trainable parameters. + +An example command to launch a training job is: + +.. code-block:: console + + python3 /NeMo/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ + exp_manager.exp_dir="PATH_TO_SAVE_LORA_WEIGHTS" \ + model.global_batch_size=4 \ # the exact choice of global batch size is data dependent; typical values are in the range of 32 to 128. + model.micro_batch_size=4 \ # the exact choice of micro batch size is GPU memory dependent; 2 to 8 are reasonable values.
+ trainer.devices=1 \ # indicates how many GPUs to use per node during training. + trainer.num_nodes=1 \ # indicates how many nodes to use if a multi-node cluster is available. + trainer.max_steps=20 \ # how many training steps to run. + model.restore_from_path="PATH_TO_BASE_NEMO_MODEL" \ + model.peft.lora_tuning.adapter_dim=16 \ # the low-rank size for LoRA weights. + model.data.train_ds.file_names=["train.jsonl"] + +The full list of possible run arguments can be found in ``/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml``. By default, a trained model file should be generated under ``PATH_TO_SAVE_LORA_WEIGHTS/megatron_gpt_peft_lora_tuning/checkpoints/``, typically with the extension ``.nemo``. + + +Inference using a GPT Embedding Model +------------------------------------- + +Once trained, the GPT Embedding model can be used to generate embeddings for queries and corpus documents. We can launch inference using the following command: + +.. code-block:: console + + python3 /NeMo/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + model.restore_from_path="PATH_TO_BASE_NEMO_MODEL" \ # Same base model used at training time. + model.peft.restore_from_path="PATH_TO_SAVE_LORA_WEIGHTS/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo" \ + model.data.test_ds.query_file_names=["test_query.jsonl"] \ + model.data.test_ds.doc_file_names=["test_docs.jsonl"] \ + model.data.test_ds.write_embeddings_to_file=True \ + model.data.test_ds.output_file_path_prefix="PATH_TO_SAVE_EMBEDDINGS" + +The contents of ``test_query.jsonl`` are expected to be in the following format:: + +{"query": "What do ... quantities?", "query_id": "q11600", "doc_id": "d1172"} +{"query": "What are ... subsectors?", "query_id": "q5831", "doc_id": "d577"} +{"query": "Which article ... Government?", "query_id": "q3037", "doc_id": "d336"} + +Here, the ``doc_id`` field is expected to be the id of the document/passage that is the correct passage for the query. Note that since we are in inference mode, query-doc pairs are not required. + +The contents of ``test_docs.jsonl`` are expected to be in the following format:: + +{"pos_doc": "Hormones ... vitamin D.", "doc_id": "d823"} +{"pos_doc": "Historically, Victoria ... October 2016.", "doc_id": "d159"} +{"pos_doc": "Exceptional examples ... Warsaw.", "doc_id": "d1084"} + +Once again, we show 3 examples from each file. Typically, ``test_docs.jsonl`` will contain more items than there are queries in ``test_query.jsonl``. + +The inference command will produce two folders: + +* ``PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_queries`` +* ``PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_docs`` + +The ``X`` in the folder name ``consumed_samplesX`` is a number denoting the number of batches consumed. This is not crucial at test time, but it is useful during training. First, let's take a look at the ``test_queries`` folder. + +.. code-block:: console + + $> ls PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_queries + query.ids query.npy + $> head -n3 PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_queries/query.ids + q11600 + q5831 + q3037 + +``query.npy`` is a pickled numpy array containing rows of query embeddings, and the ``query.ids`` text file lists the id of each embedding in the same order. + +Similarly, let's look at the ``test_docs`` folder + +.. 
code-block:: console + + $> ls PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_doc/ + doc.ids doc.npy + $> head -n3 PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_doc/doc.ids + d823 + d159 + d1084 + +We can see that ``test_doc`` has a similar structure to ``test_queries`` but with ids and embeddings of the documents from the ``test_docs.josnl`` file. With this setup it is possible to evaluate the performance using metrics like MRR or NDCG. diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml index 778dc937efdc..1a81d21dd9a8 100644 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 accelerator: gpu num_nodes: 1 - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -66,8 +66,14 @@ model: hidden_dropout: 0.0 attention_dropout: 0.0 ffn_dropout: 0.0 - temperature: 0.8 + temperature: 0.02 num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only + use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only + post_process: False # should be False. + transformer_engine: True # required to be True for newer versions of Megatron-LM based models + mcore_gpt: True # required to be True for newer versions of Megatron-LM based models + use_flash_attention: True + precision: bf16 peft: peft_scheme: "lora" # can be either adapter,ia3, or ptuning @@ -119,8 +125,8 @@ model: query_file_names: ??? # Path to a list of JSONL files corresponding to the query data. Data format is identical to validation_ds. doc_file_names: ??? # Path to a list of JSONL files corresponding to the doc data. Data format is identical to validation_ds. names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. - global_batch_size: 1 - micro_batch_size: 1 + global_batch_size: ${global_batch_size} + micro_batch_size: ${micro_batch_size} shuffle: False num_workers: 0 pin_memory: True diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml index efd5271884ed..315bffd8a1ff 100644 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml @@ -4,15 +4,16 @@ trainer: devices: 1 accelerator: gpu num_nodes: 1 - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False - max_epochs: 9999 + max_epochs: null max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch - gradient_clip_val: 1.0 + val_check_interval: ${trainer.max_steps} # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: null + num_sanity_val_steps: 0 exp_manager: explicit_log_dir: null @@ -34,7 +35,7 @@ exp_manager: model_parallel_size: ${model.tensor_model_parallel_size} always_save_nemo: False save_best_model: True - create_early_stopping_callback: True + create_early_stopping_callback: False early_stopping_callback_params: monitor: "val_loss" mode: "min" @@ -54,7 +55,7 @@ model: resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. sync_batch_comm: False - megatron_amp_O2: False + megatron_amp_O2: True ## Sequence Parallelism # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially @@ -62,8 +63,8 @@ model: sequence_parallel: False ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_granularity: selective # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' # 'uniform' divides the total number of transformer layers and checkpoints the input activation # of each chunk at the specified granularity # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity @@ -74,9 +75,14 @@ model: hidden_dropout: 0.0 attention_dropout: 0.0 ffn_dropout: 0.0 - temperature: 0.8 + temperature: 0.02 num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only + post_process: False # should be False. + transformer_engine: True # required to be True for newer versions of Megatron-LM based models + mcore_gpt: True # required to be True for newer versions of Megatron-LM based models + use_flash_attention: True + precision: bf16 peft: peft_scheme: "lora" # can be either adapter,ia3, or ptuning @@ -135,7 +141,7 @@ model: num_workers: 0 memmap_workers: 2 pin_memory: True - max_seq_length: 2048 + max_seq_length: 512 # Even if the base model can handle longer sequences, 512 is generally a good choice for training efficiency. min_seq_length: 1 drop_last: True # Example of how to specify concat_sampling_probabilities @@ -143,15 +149,16 @@ model: # - 0.5 # - 0.25 # - 0.25 - concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + concat_sampling_probabilities: + - 1.0 label_key: 'output' add_eos: True add_bos: False index_mapping_dir: null # Path to a directory to write index mapping files. truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] validation_ds: - query_file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - doc_file_names: ??? 
# Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + query_file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + doc_file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. global_batch_size: ${model.global_batch_size} micro_batch_size: ${model.micro_batch_size} @@ -159,7 +166,7 @@ model: num_workers: 0 memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True - max_seq_length: 2048 + max_seq_length: ${model.data.train_ds.max_seq_length} min_seq_length: 1 drop_last: False label_key: ${model.data.train_ds.label_key} @@ -182,7 +189,7 @@ model: num_workers: 0 memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True - max_seq_length: 2048 + max_seq_length: ${model.data.train_ds.max_seq_length} min_seq_length: 1 drop_last: False add_eos: ${model.data.train_ds.add_eos} diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py index 110e59494b52..4cdeba1d67e2 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py @@ -123,8 +123,10 @@ def _build_dataset(self, data_cfg, is_train=True): _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) else: - num_query_samples_per_dataset = [[None]] * len(data_cfg.query_file_names) - num_doc_samples_per_dataset = [[None]] * len(data_cfg.doc_file_names) + num_query_files = len(data_cfg.query_file_names) if data_cfg.query_file_names is not None else 0 + num_doc_files = len(data_cfg.doc_file_names) if data_cfg.doc_file_names is not None else 0 + num_query_samples_per_dataset = [[None]] * num_query_files + num_doc_samples_per_dataset = [[None]] * num_doc_files # Check dataset max_seq_legnth and max_position_embeddings size if ( @@ -174,6 +176,9 @@ def _build_dataset(self, data_cfg, is_train=True): ) return dataset else: + if data_cfg.query_file_names is None or data_cfg.doc_file_names is None: + return [] + query_dataset = GPTEmbeddingDataset( file_path=data_cfg.query_file_names[0], tokenizer=self.tokenizer, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 448f912c44d6..892a87189880 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -804,7 +804,8 @@ def build_train_valid_test_datasets(self, stage): logging.info('Building GPT SFT validation datasets.') # Wrap this in a list since the general finetuning parent class supports multi-validation. 
self._validation_ds = self._build_dataset(self.cfg.data.validation_ds, is_train=False) - logging.info(f'Length of val dataset: {len(self._validation_ds[0])}') + if self._validation_ds: + logging.info(f'Length of val dataset: {len(self._validation_ds[0])}') if stage != 'validate': self.maybe_build_test() From 43afd943507ec583072271c607341ea63c574496 Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:51:31 -0700 Subject: [PATCH 002/178] remove in-place addition for dreambooth train with text encoder (#8825) * remove in-place addition for dreambooth train with text encoder Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Signed-off-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../multimodal/models/text_to_image/dreambooth/dreambooth.py | 3 ++- .../multimodal/modules/stable_diffusion/encoders/modules.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py index 317cdf5d6364..0b830ac7319b 100644 --- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py +++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py @@ -20,7 +20,6 @@ from torch._inductor import config as inductor_config from nemo.collections.multimodal.data.dreambooth.dreambooth_dataset import DreamBoothDataset -from nemo.collections.multimodal.modules.stable_diffusion.attention import LinearWrapper from nemo.collections.multimodal.modules.stable_diffusion.distributions.distributions import ( DiagonalGaussianDistribution, ) @@ -647,6 +646,8 @@ def load_from_checkpoint( return checkpoint def _check_and_add_adapter(self, name, module, peft_name, peft_cfg, name_key_to_mcore_mixins=None): + from nemo.collections.multimodal.modules.stable_diffusion.attention import LinearWrapper + if isinstance(module, AdapterModuleMixin): if isinstance(module, LinearWrapper): peft_cfg.in_features, peft_cfg.out_features = module.in_features, module.out_features diff --git a/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py b/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py index 446b81ab11b6..bff579bbca4f 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py @@ -718,7 +718,7 @@ def forward(self, text): def encode_with_transformer(self, text): x = self.model.language_model.embedding.word_embeddings(text) - x += self.model.language_model.embedding.position_embeddings + x = x + self.model.language_model.embedding.position_embeddings x = x.permute(1, 0, 2) # NLD -> LND x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask) x = self.model.language_model.encoder.final_layernorm(x) From 3735b5c9a953021e7f4d3843009b8f4636057026 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 30 Apr 2024 02:33:24 +0200 Subject: [PATCH 003/178] [NeMo-UX] Add checkpoint-io to MegatronStrategy (#9057) * Adding MegatronParallel * Move over _strategy_liMegatronCheckpointIO * Adding GPTModel & MockDataModule * Add nemo.io to MegatronStrategy * Move to cloudpickle * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * fix tests Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Chen Cui --- nemo/io/__init__.py | 14 +++ nemo/io/api.py | 62 +++++++++++ nemo/io/capture.py | 98 +++++++++++++++++ nemo/io/mixin.py | 139 ++++++++++++++++++++++++ nemo/io/pl.py | 49 ++++++++- nemo/lightning/__init__.py | 3 +- nemo/lightning/pytorch/strategies.py | 11 +- nemo/lightning/pytorch/trainer.py | 15 +++ nemo/llm/gpt/model/base.py | 5 +- requirements/requirements.txt | 1 + requirements/requirements_lightning.txt | 1 + tests/io/__init__.py | 0 tests/io/test_api.py | 18 +++ tests/io/test_mixin.py | 16 +++ 14 files changed, 422 insertions(+), 10 deletions(-) create mode 100644 nemo/io/api.py create mode 100644 nemo/io/capture.py create mode 100644 nemo/io/mixin.py create mode 100644 nemo/lightning/pytorch/trainer.py create mode 100644 tests/io/__init__.py create mode 100644 tests/io/test_api.py create mode 100644 tests/io/test_mixin.py diff --git a/nemo/io/__init__.py b/nemo/io/__init__.py index e69de29bb2d1..5b1d48768848 100644 --- a/nemo/io/__init__.py +++ b/nemo/io/__init__.py @@ -0,0 +1,14 @@ +from nemo.io.api import load, load_ckpt +from nemo.io.capture import reinit +from nemo.io.mixin import IOMixin +from nemo.io.pl import TrainerCheckpoint, is_distributed_ckpt + + +__all__ = [ + "IOMixin", + "is_distributed_ckpt", + "load", + "load_ckpt", + 'reinit', + "TrainerCheckpoint", +] diff --git a/nemo/io/api.py b/nemo/io/api.py new file mode 100644 index 000000000000..f7de36cb9545 --- /dev/null +++ b/nemo/io/api.py @@ -0,0 +1,62 @@ +import pickle +from pathlib import Path +from typing import Any, Type, TypeVar + +import fiddle as fdl + +from nemo.io.pl import TrainerCheckpoint + +CkptType = TypeVar("CkptType") + + +def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: + """ + Loads a configuration from a pickle file and constructs an object of the specified type. + + Args: + path (Path): The path to the pickle file or directory containing 'io.pkl'. + output_type (Type[CkptType]): The type of the object to be constructed from the loaded data. + + Returns + ------- + CkptType: An instance of the specified type constructed from the loaded configuration. + + Raises + ------ + FileNotFoundError: If the specified file does not exist. + + Example: + loaded_model = load("/path/to/model", output_type=MyModel) + """ + del output_type # Just for type-hint + + _path = Path(path) + if hasattr(_path, 'is_dir') and _path.is_dir(): + _path = Path(_path) / "io.pkl" + elif hasattr(_path, 'isdir') and _path.isdir: + _path = Path(_path) / "io.pkl" + + if not _path.is_file(): + raise FileNotFoundError(f"No such file: '{_path}'") + + with open(_path, "rb") as f: + config = pickle.load(f) + + return fdl.build(config) + + +def load_ckpt(path: Path) -> TrainerCheckpoint: + """ + Loads a TrainerCheckpoint from a pickle file or directory. + + Args: + path (Path): The path to the pickle file or directory containing 'io.pkl'. + + Returns + ------- + TrainerCheckpoint: The loaded TrainerCheckpoint instance. 
+ + Example: + checkpoint: TrainerCheckpoint = load_ckpt("/path/to/checkpoint") + """ + return load(path, output_type=TrainerCheckpoint) diff --git a/nemo/io/capture.py b/nemo/io/capture.py new file mode 100644 index 000000000000..2a65d18c15e3 --- /dev/null +++ b/nemo/io/capture.py @@ -0,0 +1,98 @@ +import functools +import logging +from typing import Callable, Generic, Optional, Protocol, TypeVar, runtime_checkable + +import fiddle as fdl + +log = logging.getLogger(__name__) + + +def capture(to_capture: Optional[Callable] = None): + if to_capture is None: + return lambda f: capture(f) + + @functools.wraps(to_capture) + def wrapper(*args, **kwargs): + if isinstance(to_capture, IOProtocol): + return to_capture(*args, **kwargs) + + output = to_capture(*args, **kwargs) + if not hasattr(output, '__dict__'): + try: + if isinstance(output, (int, float, str, tuple)): + new_output = type_factory(type(output), base_value=output) + else: + NewType = type_factory(type(output)) + new_output = NewType(output) + new_output.__io__ = fdl.Partial(to_capture, *args, **kwargs) + output = new_output + except Exception as e: + logging.error(f"Error creating configurable type: {e}") + else: + output.__io__ = fdl.Partial(to_capture, *args, **kwargs) + + return output + + return wrapper + + +SelfT = TypeVar("SelfT", covariant=True) + + +@runtime_checkable +class IOProtocol(Protocol, Generic[SelfT]): + @property + def __io__(self) -> fdl.Config[SelfT]: + ... + + +@runtime_checkable +class ReInitProtocol(Protocol, Generic[SelfT]): + def reinit(self) -> SelfT: + ... + + +def reinit(configurable: IOProtocol[SelfT]) -> SelfT: + if isinstance(configurable, ReInitProtocol): + return configurable.reinit() + + if not hasattr(configurable, '__io__'): + raise ValueError(f"Cannot reinit {configurable} because it does not have a __io__ attribute") + + return fdl.build(configurable.__io__) + + +# Global cache for dynamically created types +type_cache = {} + + +def type_factory(original_type, base_value=None): + """ + Factory function to create or retrieve from cache a new type that can have additional attributes, + even if the original type is immutable. + + Args: + original_type: The type of the original output value. + base_value: The base value to use for immutable types, if applicable. + + Returns + ------- + A new type that inherits from the original type and can have additional attributes, + or an instance of this new type if base_value is provided. 
+ """ + type_name = f"Configurable{original_type.__name__}" + if type_name in type_cache: + NewType = type_cache[type_name] + else: + NewType = type(f"Configurable{original_type.__name__}", (original_type,), {}) + type_cache[type_name] = NewType + + if base_value is not None: + try: + instance = NewType(base_value) + except TypeError: + logging.warning(f"Could not instantiate type {NewType.__name__} with base value.") + instance = NewType() + return instance + + return NewType diff --git a/nemo/io/mixin.py b/nemo/io/mixin.py new file mode 100644 index 000000000000..d09c456f7957 --- /dev/null +++ b/nemo/io/mixin.py @@ -0,0 +1,139 @@ +import functools +import inspect +from dataclasses import is_dataclass +from pathlib import Path +from typing import Any, Dict + +import fiddle as fdl +from cloudpickle import dump +from typing_extensions import Self + +from nemo.io.capture import IOProtocol + + +class IOMixin: + """ + A mixin class designed to capture the arguments passed to the `__init__` method, + facilitating the re-creation of the object through `io.reinit` method using stored configurations. + + This class intercepts the initialization of an object to store the arguments in a configuration + object, which can be serialized and later used to reinitialize the object to its original state. + It utilizes `fdl.Config` from the Fiddle library to create a structured configuration object + that holds the initialization parameters. This configuration object is crucial for enabling + serialization and deserialization of the parameters, thus allowing the object to be reconstructed + at a later time with the same initial state. + + Attributes + ---------- + __io__ (fdl.Config[Self]): A configuration object that stores the captured initialization + parameters in a structured format. This object is an instance of `fdl.Config`, which allows + for the serialization and deserialization of the parameters, enabling the object to be + reconstructed at a later time with the same initial state. + + Examples + -------- + from nemo import io + + class ExampleClass(io.IOMixin): + def __init__(self, param1, param2): + super().__init__() + self.param1 = param1 + self.param2 = param2 + + # Creating an instance of ExampleClass + example = ExampleClass('value1', 'value2') + example_copy = io.reinit(example) + + + Note: + For more information on `fdl.Config`, refer to the Fiddle library documentation at + [Fiddle Config Documentation](https://fiddle.readthedocs.io/en/latest/api_reference/core.html#config). + + """ + + __io__ = fdl.Config[Self] + + def __new__(cls, *args, **kwargs): + """ + Overrides the default object creation process to wrap the `__init__` method, allowing + initialization arguments to be captured and stored in the `__io__` attribute. + + Args: + *args: Variable length argument list for the `__init__` method. + **kwargs: Arbitrary keyword arguments for the `__init__` method. + + Returns + ------- + The newly created object instance. 
+ """ + original_init = cls.__init__ + + @functools.wraps(original_init) + def wrapped_init(self, *args, **kwargs): + cfg_kwargs = self.io_transform_args(original_init, *args, **kwargs) + self.__io__ = self.io_init(**cfg_kwargs) + original_init(self, *args, **kwargs) + + cls.__init__ = wrapped_init + output = object().__new__(cls) + + return output + + def io_transform_args(self, init_fn, *args, **kwargs) -> Dict[str, Any]: + """ + Transforms and captures the arguments passed to the `__init__` method, filtering out + any arguments that are instances of `IOProtocol` or are dataclass fields with default + factories. + + Args: + init_fn (Callable): The original `__init__` method of the class. + *args: Variable length argument list for the `__init__` method. + **kwargs: Arbitrary keyword arguments for the `__init__` method. + + Returns + ------- + Dict[str, Any]: A dictionary of the captured and transformed arguments. + """ + sig = inspect.signature(init_fn) + bound_args = sig.bind_partial(self, *args, **kwargs) + bound_args.apply_defaults() + config_kwargs = {k: v for k, v in bound_args.arguments.items() if k != "self"} + + to_del = [] + for key in config_kwargs: + if isinstance(config_kwargs[key], IOProtocol): + config_kwargs[key] = config_kwargs[key].__io__ + if is_dataclass(self): + # Check if the arg is a factory (dataclasses.field) + if config_kwargs[key].__class__.__name__ == "_HAS_DEFAULT_FACTORY_CLASS": + to_del.append(key) + + for key in to_del: + del config_kwargs[key] + + return config_kwargs + + def io_init(self, **kwargs) -> fdl.Config[Self]: + """ + Initializes the configuration object (`__io__`) with the captured arguments. + + Args: + **kwargs: A dictionary of arguments that were captured during object initialization. + + Returns + ------- + fdl.Config[Self]: The initialized configuration object. + """ + return fdl.Config(type(self), **kwargs) + + def io_dump(self, output: Path): + """ + Serializes the configuration object (`__io__`) to a file, allowing the object state to be + saved and later restored. + + Args: + output (Path): The path to the file where the configuration object will be serialized. 
+ """ + config_path = Path(output) / "io.pkl" + with open(config_path, "wb") as f: + dump(self.__io__, f) diff --git a/nemo/io/pl.py b/nemo/io/pl.py index 659ef0d6621b..ba9b5be72cab 100644 --- a/nemo/io/pl.py +++ b/nemo/io/pl.py @@ -1,6 +1,7 @@ import logging +from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Callable, Dict, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Generic, Optional, Protocol, TypeVar, Union import pytorch_lightning as pl import torch @@ -8,8 +9,13 @@ from lightning_fabric.utilities.cloud_io import get_filesystem from lightning_fabric.utilities.types import _PATH from torch import nn -from typing_extensions import override +from typing_extensions import Self, override +from nemo.io.capture import IOProtocol +from nemo.io.mixin import IOMixin + +if TYPE_CHECKING: + from nemo.lightning.pytorch.strategies import MegatronStrategy log = logging.getLogger(__name__) @@ -18,6 +24,42 @@ ModuleT = TypeVar("ModuleT", bound=nn.Module) +@dataclass +class TrainerCheckpoint(IOMixin, Generic[LightningModuleT]): + model: LightningModuleT + trainer: pl.Trainer + extra: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_strategy(cls, strategy: "MegatronStrategy") -> Self: + if not isinstance(strategy.trainer, IOProtocol): + raise ValueError(f"Trainer must be an instance of {IOProtocol}. Please use the Trainer from nemo.") + + if not isinstance(strategy.lightning_module, IOProtocol): + raise ValueError("LightningModule must extend IOMixin.") + + return cls(trainer=strategy.trainer, model=strategy.lightning_module, extra=cls.construct_extra(strategy)) + + @classmethod + def construct_extra(cls, strategy: "MegatronStrategy") -> Dict[str, Any]: + extra = {} + if hasattr(strategy.trainer, "datamodule") and isinstance(strategy.trainer.datamodule, IOProtocol): + extra["datamodule"] = strategy.trainer.datamodule.__io__ + + # TODO: Add optimizer to extra + + return extra + + +class TrainerCkptProtocol(Protocol): + @classmethod + def from_strategy(cls, strategy: "MegatronStrategy") -> Self: + ... + + def io_dump(self, output: Path): + ... + + class MegatronCheckpointIO(CheckpointIO): """CheckpointIO that utilizes :func:`torch.save` and :func:`torch.load` to save and load checkpoints respectively, common for most use cases. @@ -54,7 +96,6 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir): logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving') return - fs.makedirs(checkpoint_dir, exist_ok=True) dist_checkpointing.save(sharded_state_dict=checkpoint, checkpoint_dir=str(checkpoint_dir)) @@ -113,7 +154,6 @@ def _fix_tensors_device(ckpt: Dict) -> Dict: """Ensure checkpoint tensors are on the correct device.""" assert torch.cuda.is_initialized(), (torch.cuda.is_available(), torch.cuda.is_initialized()) cur_dev = torch.device("cuda", index=torch.cuda.current_device()) - from megatron.core.dist_checkpointing.dict_utils import dict_list_map_outplace def _fix_device(t): @@ -130,7 +170,6 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: to be used as a directory for distributed checkpoints. 
""" filepath = Path(filepath) - if not filepath.suffix == ".ckpt": filepath = filepath.with_suffix(filepath.suffix + ".ckpt") diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index a508f29b9ace..f900345f96eb 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -7,6 +7,7 @@ from nemo.lightning.pytorch.plugins import MegatronDataSampler from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler from nemo.lightning.pytorch.strategies import MegatronStrategy +from nemo.lightning.pytorch.trainer import Trainer # We monkey patch because nvidia uses a naming convention for SLURM jobs @@ -21,4 +22,4 @@ def _is_slurm_interactive_mode(): _pl_plugins._PLUGIN_INPUT = Union[_pl_plugins._PLUGIN_INPUT, _data_sampler.DataSampler] # noqa: SLF001 -__all__ = ["MegatronStrategy", "MegatronDataSampler", "get_vocab_size", "teardown"] +__all__ = ["MegatronStrategy", "MegatronDataSampler", "Trainer", "get_vocab_size", "teardown"] diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 89cbe98cf707..65986b2a4855 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -27,7 +27,8 @@ from torch.utils.data import DataLoader from typing_extensions import override -from nemo.io.pl import MegatronCheckpointIO +from nemo import io +from nemo.io.pl import MegatronCheckpointIO, TrainerCheckpoint, TrainerCkptProtocol from nemo.lightning import _strategy_lib from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction from nemo.lightning.pytorch.callbacks import MegatronProgressBar @@ -38,7 +39,7 @@ ConfigT = TypeVar("ConfigT") -class MegatronStrategy(DDPStrategy): +class MegatronStrategy(DDPStrategy, io.IOMixin): """Megatron plugin for Pytorch Lightning. 
Args: @@ -60,6 +61,8 @@ def __init__( checkpoint_io=None, # TODO: Add type-hint no_ddp_communication_hook: bool = True, find_unused_parameters: bool = False, + enable_nemo_ckpt_io: bool = True, + ckpt_type: TrainerCkptProtocol = TrainerCheckpoint, lazy_init: bool = False, **kwargs, ) -> None: @@ -77,6 +80,8 @@ def __init__( self.pipeline_model_parallel_size = pipeline_model_parallel_size self.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size self.sequence_parallel = sequence_parallel + self.enable_nemo_ckpt_io = enable_nemo_ckpt_io + self.ckpt_type = ckpt_type self.lazy_init = lazy_init # used in NVIDIA NGC PyTorch containers @@ -346,6 +351,8 @@ def save_checkpoint( checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict()] self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) + if self.enable_nemo_ckpt_io and self.is_global_zero and self.ckpt_type: + self.ckpt_type.from_strategy(self).io_dump(ckpt_to_dir(filepath)) @override def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py new file mode 100644 index 000000000000..da04a93eef05 --- /dev/null +++ b/nemo/lightning/pytorch/trainer.py @@ -0,0 +1,15 @@ +from copy import deepcopy + +import fiddle as fdl +import pytorch_lightning as pl +from typing_extensions import Self + +from nemo.io.mixin import IOMixin + + +class Trainer(pl.Trainer, IOMixin): + def io_init(self, **kwargs) -> fdl.Config[Self]: + # Each argument of the trainer can be stateful so we copy them + cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items()} + + return fdl.Config(type(self), **cfg_kwargs) diff --git a/nemo/llm/gpt/model/base.py b/nemo/llm/gpt/model/base.py index 02588b494077..93186a7e7e08 100644 --- a/nemo/llm/gpt/model/base.py +++ b/nemo/llm/gpt/model/base.py @@ -7,6 +7,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from torch.optim import Optimizer +from nemo import io from nemo.lightning import get_vocab_size from nemo.lightning.megatron_parallel import MaskedTokenLossReduction @@ -17,7 +18,7 @@ @dataclass -class GPTConfig(TransformerConfig): +class GPTConfig(TransformerConfig, io.IOMixin): # From megatron.core.models.gpt.gpt_model.GPTModel fp16_lm_cross_entropy: bool = False parallel_output: bool = True @@ -60,7 +61,7 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": ) -class GPTModel(L.LightningModule): +class GPTModel(L.LightningModule, io.IOMixin): def __init__( self, config: GPTConfig, diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 20efa2b22013..e2a558929146 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,3 +1,4 @@ +fiddle huggingface_hub>=0.20.3 numba numpy>=1.22 diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 6bd43cdfc9c7..6acfddad9189 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,3 +1,4 @@ +fiddle hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>=2.2.1 diff --git a/tests/io/__init__.py b/tests/io/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/io/test_api.py b/tests/io/test_api.py new file mode 100644 index 000000000000..d4c317bf2e9f --- /dev/null +++ b/tests/io/test_api.py @@ -0,0 +1,18 @@ +from nemo import io +from nemo import lightning as nl +from nemo import llm + + +class TestLoad: + def 
test_reload_ckpt(self, tmpdir): + trainer = nl.Trainer(devices=1, accelerator="cpu", strategy=nl.MegatronStrategy()) + # model = llm.Mistral7BModel() + model = llm.GPTModel( + llm.GPTConfig(num_layers=2, hidden_size=1024, ffn_hidden_size=4096, num_attention_heads=8,) + ) + + ckpt = io.TrainerCheckpoint(model, trainer) + ckpt.io_dump(tmpdir) + loaded = io.load_ckpt(tmpdir) + + assert loaded.model.config.seq_length == ckpt.model.config.seq_length diff --git a/tests/io/test_mixin.py b/tests/io/test_mixin.py new file mode 100644 index 000000000000..ed898d435609 --- /dev/null +++ b/tests/io/test_mixin.py @@ -0,0 +1,16 @@ +from nemo import io + + +class DummyClass(io.IOMixin): + def __init__(self, a: int, b: int): + self.a = a + self.b = b + + +class TestIOMixin: + def test_reinit(self): + dummy = DummyClass(5, 5) + copied = io.reinit(dummy) + assert copied is not dummy + assert copied.a == dummy.a + assert copied.b == dummy.b From be59e0814835c2575e66887d9b6fb32bbdcf94df Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:32:05 -0700 Subject: [PATCH 004/178] Mingyuanm/sdxl quantization notebook (#9042) * Move cached embedding devices and dtype for onnx export consistency Signed-off-by: Mingyuan Ma * Add old trt export/inference script, currently not working in latest container. Signed-off-by: Mingyuan Ma * Update intro and why nemo in dev doc * Categorize tutorials * Add NeMo TRT inference pipeline and quatization workflow Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add guards to avoid undefined variables Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fix Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update tutorials link * update index * Restructure * Restructure * Restructure * Restructure * Restructure * Restructure * Add conversion script from hf sdxl to nemo sdxl Signed-off-by: Mingyuan Ma * Update quantize pipeline to adapt to variable image dimension Signed-off-by: Mingyuan Ma * update sdxl pipeline to be aware of additional emb channels Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add guards for potential local var Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * copyright header Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Restructure * Restructure * Update flash attention * Update flash attention * Update file paths Signed-off-by: Mingyuan Ma * Fix few structure issue * Fix migration * Fix structure * Fix structure * Few updates * Add few more scripts * Fix scripts * Fix few things * Fix tutorial table * Restructure * Rename * Add notebook Signed-off-by: Mingyuan Ma * WIP Signed-off-by: Mingyuan Ma * Documentation Signed-off-by: Mingyuan Ma * Few fixes and moves * Move sections * Fix bib * Refactor files * Fixes * Update quantization script Signed-off-by: Mingyuan Ma * Add tutorial and docs Signed-off-by: Mingyuan Ma * Add images Signed-off-by: Mingyuan Ma * Fix * Fix few issues * remove scripts * Update comments Signed-off-by: Mingyuan Ma * Update docs Signed-off-by: Mingyuan Ma * Add links to sdxl quantization 
tutorial Signed-off-by: Mingyuan Ma * Add link to new tutorial Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused import Signed-off-by: Mingyuan Ma * Using links to images Signed-off-by: Mingyuan Ma * remove unused imports Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay --- docs/source/multimodal/text2img/intro.rst | 1 + .../multimodal/text2img/sdxl_quantization.rst | 156 ++++ docs/source/starthere/tutorials.rst | 3 + .../stable_diffusion/conf/sd_xl_quantize.yaml | 32 +- .../stable_diffusion/sd_xl_quantize.py | 45 + .../stable_diffusion/sd_xl_trt_inference.py | 5 +- nemo/utils/trt_utils.py | 4 + tutorials/multimodal/SDXL Quantization.ipynb | 851 ++++++++++++++++++ 8 files changed, 1084 insertions(+), 13 deletions(-) create mode 100644 docs/source/multimodal/text2img/sdxl_quantization.rst create mode 100644 tutorials/multimodal/SDXL Quantization.ipynb diff --git a/docs/source/multimodal/text2img/intro.rst b/docs/source/multimodal/text2img/intro.rst index 9ec793d246fa..3c3c17768679 100644 --- a/docs/source/multimodal/text2img/intro.rst +++ b/docs/source/multimodal/text2img/intro.rst @@ -13,3 +13,4 @@ NeMo multimodal provides implementations of multiple image-to-text models, inclu imagen dreambooth controlnet + sdxl_quantization diff --git a/docs/source/multimodal/text2img/sdxl_quantization.rst b/docs/source/multimodal/text2img/sdxl_quantization.rst new file mode 100644 index 000000000000..78403e9c402c --- /dev/null +++ b/docs/source/multimodal/text2img/sdxl_quantization.rst @@ -0,0 +1,156 @@ +Stable Diffusion XL Int8 Quantization +======================================= + +This example shows how to use Ammo to calibrate and quantize the UNet part of the SDXL. The UNet part typically consumes +>95% of the e2e Stable Diffusion latency. + +We also provide instructions on deploying and running E2E SDXL pipeline +with Ammo quantized int8 UNet to generate images and measure latency on target GPUs. + +To get started, it is required to have a pretrained SDXL checkpoint in `nemo` format. The example training configs are provided in NeMo, +which is located in `NeMo/examples/multimodal/text2img/stable_diffusion`. + +Calibration +--------------- +The first step is to run quantization script with default config, and finally the script will export the quantized unet to onnx file. +Here is the default config for `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_quantize.py`. + + +.. code-block:: yaml + quantize + exp_name: nemo + n_steps: 20 # number of inference steps + format: 'int8' # only int8 quantization is supported now + percentile: 1.0 # Control quantization scaling factors (amax) collecting range, meaning that we will collect the minimum amax in the range of `(n_steps * percentile)` steps. Recommendation: 1.0 + batch_size: 1 # batch size calling sdxl inference pipeline during calibration + calib_size: 32 # For SDXL, we recommend 32, 64 or 128 + quant_level: 2.5 #Which layers to be quantized, 1: `CNNs`, 2: `CNN + FFN`, 2.5: `CNN + FFN + QKV`, 3: `CNN + Linear`. Recommendation: 2, 2.5 and 3, depending on the requirements for image quality & speedup. + alpha: 0.8 # A parameter in SmoothQuant, used for linear layers only. 
Recommendation: 0.8 for SDXL + + + +Important Parameters +^^^^^^^^^^^^^^^^^^^^ +- percentile: Control quantization scaling factors (amax) collecting range, meaning that we will collect the minimum amax in the range of (n_steps * percentile) steps. Recommendation: 1.0 +- alpha: A parameter in SmoothQuant, used for linear layers only. Recommendation: 0.8 for SDXL, 1.0 for SD 1.5 +- quant-level: Which layers to be quantized, 1: CNNs, 2: CNN + FFN, 2.5: CNN + FFN + QKV, 3: CNN + Linear. Recommendation: 2, 2.5 and 3, depending on the requirements for image quality & speedup. +- calib-size: For SDXL, we recommend 32, 64 or 128, for SD 1.5, set to 512 or 1024. + + +Build the TRT engine for the Quantized ONNX UNet +------------------------------------------------------------ + +.. code-block:: bash + trtexec --onnx=./nemo_onnx/unet.onnx --shapes=x:8x4x128x128,timesteps:8,context:8x80x2048,y:8x2816 --fp16 --int8 --builderOptimizationLevel=4 --saveEngine=nemo_unet_xl.plan + + +Important Parameters +^^^^^^^^^^^^^^^^^^^^ +Input shape has to be provided here when building TRT engine. +- x: Input image latent shape (B * C * H * W) +- context: Input text conditioning (B * S * hidden_dimention) +- y: Additional embedding (B * adm_in_channels) + +Build End-to-end Stable Diffusion XL Pipeline with NeMo +----------------------------------------------------------- + +We provide a script to build end to end TRT inference pipeline with NeMo backend, which is located at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_export.py`. + +.. code-block:: yaml + infer: + out_path: sdxl_export + width: 1024 + height: 1024 + batch_size: 2 + + trt: + static_batch: False + min_batch_size: 1 + max_batch_size: 8 + +Important Parameters +^^^^^^^^^^^^^^^^^^^^ +- out_path: Directory to save onnx file and TRT engine files +- width and height: Image resolution of inference output +- batch_size: Only used for dummy input generation and onnx sanity check +- {min,max}_batch_size: The input batch size of TRT engine along its dynamic axis + + +Run End-to-end Stable Diffusion XL TRT Pipeline +----------------------------------------------------------- + +The inference script can be found at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_trt_inference.py`. + +.. code-block:: yaml + unet_xl: sdxl_export/plan/unet_xl.plan + vae: sdxl_export/plan/vae.plan + clip1: sdxl_export/plan/clip1.plan + clip2: sdxl_export/plan/clip2.plan + + out_path: trt_output + + +Please specify unet_xl as the quantized Unet engine to run the quantized solution. The system will load the original engine file by default. 
+ +Inference Speedup +------------------- +TRT version 9.3.0 +GPU: H100 + +TRT int8 vs Framework fp16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++---------------------+------------+-------------+----------------+------------+---------+------------+ +| Pipeline | Batch Size | Latency (ms)| Pipeline | Batch Size | Latency | Speedup | ++=====================+============+=============+================+============+=========+============+ +| Framework fp16 base | 1 | 3056.01 | Ammo TRT Int8 | 1 | 1406.68 | 2.172498365| ++---------------------+------------+-------------+----------------+------------+---------+------------+ +| Framework fp16 base | 2 | 4832.24 | Ammo TRT Int8 | 2 | 2403.29 | 2.01067703 | ++---------------------+------------+-------------+----------------+------------+---------+------------+ +| Framework fp16 base | 4 | 8433.71 | Ammo TRT Int8 | 4 | 4252.6 | 1.983189108| ++---------------------+------------+-------------+----------------+------------+---------+------------+ + + + +TRT int8 vs TRT fp16 +^^^^^^^^^^^^^^^^^^^^^^^ + + ++-------------+------------+--------------+-----------+------------+------------+-------------+ +| Pipeline | Batch Size | Latency (ms) | Precision | Batch Size | Latency | Speedup | ++=============+============+==============+===========+============+============+=============+ +| fp16 base | 1 | 1723.97 | Ammo Int8 | 1 | 1406.68 | 1.225559473 | ++-------------+------------+--------------+-----------+------------+------------+-------------+ +| fp16 base | 2 | 3004.47 | Ammo Int8 | 2 | 2403.29 | 1.250148754 | ++-------------+------------+--------------+-----------+------------+------------+-------------+ +| fp16 base | 4 | 5657.19 | Ammo Int8 | 4 | 4252.6 | 1.330289705 | ++-------------+------------+--------------+-----------+------------+------------+-------------+ + + +FP16 inference vs Int8 inference +---------------------------------- + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_fp16_1.png + :width: 50% +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_1.png + :width: 50% +Prompt: A photo of a Shiba Inu dog with a backpack riding a bike. It is wearing sunglasses and a beach hat. (FP16 upper vs Int8 lower) + + + + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_fp16_2.png + :width: 50% +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_2.png + :width: 50% +Prompt: A cute corgi lives in a house made out of sushi. (FP16 upper vs Int8 lower) + + + + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_fp16_3.png + :width: 50% +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_3.png + :width: 50% +Prompt: A high contrast portrait of a very happy fuzzy panda dressed as a chef in a high end kitchen making dough. There is a painting of flowers on the wall behind him. 
(FP16 upper vs Int8 lower) + diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index 5ca48904ed9b..0298dbdf6d4b 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -63,6 +63,9 @@ Tutorial Overview * - Multimodal - Preparations and Advanced Applications: DreamBooth Tutorial - `DreamBooth Tutorial `_ + * - Multimodal + - Preparations and Advanced Applications: Stable Diffusion XL Quantization Tutorial + - `DreamBooth Tutorial `_ .. list-table:: **Automatic Speech Recognition (ASR) Tutorials** :widths: 15 30 55 diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_quantize.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_quantize.yaml index 000416f7996b..ecb75953829e 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_quantize.yaml +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_quantize.yaml @@ -2,7 +2,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 32 + precision: 16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -17,7 +17,7 @@ trainer: infer: - num_samples: 1 + num_samples: 4 prompt: - "A professional photograph of an astronaut riding a pig" - 'A photo of a Shiba Inu dog with a backpack riding a bike. It is wearing sunglasses and a beach hat.' @@ -59,25 +59,33 @@ model: quantize: exp_name: nemo_test - n_steps: 20 - format: 'int8' - percentile: 1.0 - batch_size: 1 - calib_size: 32 - quant_level: 2.5 - alpha: 0.8 + n_steps: 20 # number of inference steps + format: 'int8' # only int8 quantization is supported now + percentile: 1.0 # Control quantization scaling factors (amax) collecting range, meaning that we will collect the minimum amax in the range of `(n_steps * percentile)` steps. Recommendation: 1.0 + batch_size: 1 # batch size calling sdxl inference pipeline during calibration + calib_size: 32 # For SDXL, we recommend 32, 64 or 128 + quant_level: 2.5 #Which layers to be quantized, 1: `CNNs`, 2: `CNN + FFN`, 2.5: `CNN + FFN + QKV`, 3: `CNN + Linear`. Recommendation: 2, 2.5 and 3, depending on the requirements for image quality & speedup. + alpha: 0.8 # A parameter in SmoothQuant, used for linear layers only. 
Recommendation: 0.8 for SDXL quantized_ckpt: nemo.unet.state_dict.${quantize.exp_name}.pt onnx_export: - onnx_dir: nemo_onnx - pretrained_base: ${model.restore_from_path} - quantized_ckpt: ${quantize.quantized_ckpt} + onnx_dir: nemo_onnx # Path to save onnx files + pretrained_base: ${model.restore_from_path} # Path to nemo checkpoint for sdxl + quantized_ckpt: ${quantize.quantized_ckpt} # Path to save quantized unet checkpoint format: int8 +trt_export: + static_batch: False # static batch engines have better latency + min_batch_size: 1 # minimum batch size when using dynamic batch, has to be the same with max_batch_size and infer.num_samples when using static batch + max_batch_size: 8 # maximum batch size when using dynamic batch, has to be the same with min_batch_size and infer.num_samples when using static batch + int8: True # Allow engine builder recognize int8 precision + builder_optimization_level: 4 # set to 1-5, higher optimization level means better latency but longer compiling time + trt_engine: int8_unet_xl.plan # path to save trt engine use_refiner: False use_fp16: False # use fp16 model weights out_path: ./output run_quantization: True run_onnx_export: True +run_trt_export: True diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py index 5c5e1dd94a09..89bfcd294ae4 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py @@ -34,6 +34,7 @@ from nemo.collections.multimodal.parts.stable_diffusion.sdxl_pipeline import SamplingPipeline from nemo.collections.multimodal.parts.utils import setup_trainer_and_model_for_inference from nemo.core.config import hydra_runner +from nemo.utils.trt_utils import build_engine def do_calibrate(base, calibration_prompts, **kwargs): @@ -49,6 +50,26 @@ def do_calibrate(base, calibration_prompts, **kwargs): ) +def get_input_profile_unet( + batch_size, static_batch=False, min_batch_size=1, max_batch_size=8, latent_dim=32, adm_in_channels=1280 +): + assert batch_size >= min_batch_size and batch_size <= max_batch_size + if static_batch: + min_batch_size = batch_size if static_batch else min_batch_size + max_batch_size = batch_size if static_batch else max_batch_size + input_profile = {} + dummy_input = generate_dummy_inputs( + sd_version="nemo", device='cuda', latent_dim=latent_dim, adm_in_channels=adm_in_channels + ) + for key, value in dummy_input.items(): + input_profile[key] = [ + (min_batch_size, *(value.shape[1:])), + (batch_size, *(value.shape[1:])), + (max_batch_size, *(value.shape[1:])), + ] + return input_profile + + @hydra_runner(config_path='conf', config_name='sd_xl_quantize') def main(cfg): def model_cfg_modifier(model_cfg): @@ -147,6 +168,30 @@ def forward_loop(): opset_version=opset_version, ) + if cfg.run_trt_export: + torch.cuda.empty_cache() + batch_size = cfg.infer.get('num_samples', 1) + min_batch_size = cfg.trt_export.min_batch_size + max_batch_size = cfg.trt_export.max_batch_size + static_batch = cfg.trt_export.static_batch + fp16 = cfg.trainer.precision in ['16', '16-mixed', 16] + build_engine( + f"{cfg.onnx_export.onnx_dir}/unet.onnx", + f"{cfg.trt_export.trt_engine}", + fp16=fp16, + input_profile=get_input_profile_unet( + batch_size, + static_batch=static_batch, + min_batch_size=min_batch_size, + max_batch_size=max_batch_size, + latent_dim=cfg.sampling.base.height // 8, + 
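+ # latent_dim: spatial size of the UNet latents, i.e. the output image height divided by the VAE's 8x downscale factor
+ # adm_in_channels (the next argument): width of the pooled/vector conditioning, read from the loaded UNet so the dummy
+ # inputs used to build the TensorRT optimization profile match the checkpoint (2816 for the SDXL base UNet)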
adm_in_channels=base.model.model.diffusion_model.adm_in_channels, + ), + timing_cache=None, + int8=cfg.trt_export.int8, + builder_optimization_level=cfg.trt_export.builder_optimization_level, + ) + if __name__ == "__main__": main() diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py index 04fc7bd91315..14c64a58a8af 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py @@ -22,6 +22,7 @@ from cuda import cudart from transformers import CLIPTokenizer +from nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser import DiscreteDenoiser from nemo.collections.multimodal.modules.stable_diffusion.encoders.modules import ConcatTimestepEmbedderND from nemo.collections.multimodal.modules.stable_diffusion.quantization_utils.trt_engine import TRT_LOGGER, Engine from nemo.collections.multimodal.parts.stable_diffusion.sdxl_helpers import perform_save_locally @@ -176,6 +177,7 @@ def run(self, prompt, negative_prompt, image_height, image_width, num_samples, a with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER): torch.cuda.synchronize() + e2e_tic = time.perf_counter() c, uc = self.encode_prompt(prompt, negative_prompt) @@ -198,8 +200,9 @@ def denoiser(input, sigma, c): samples_z = self.sampler(denoiser, randn, cond=c, uc=uc) samples_x = self.decode_images(samples_z) + e2e_tic = time.perf_counter() - e2e_tic samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0) - + print(f'This batch takes {e2e_tic}s') perform_save_locally(self.cfg.out_path, samples) diff --git a/nemo/utils/trt_utils.py b/nemo/utils/trt_utils.py index 73e899532691..a355a8e9e77e 100644 --- a/nemo/utils/trt_utils.py +++ b/nemo/utils/trt_utils.py @@ -32,6 +32,8 @@ def build_engine( enable_preview=False, timing_cache=None, workspace_size=0, + int8=False, + builder_optimization_level=None, ): print(f"Building TensorRT engine for {onnx_path}: {output_path}") p = Profile() @@ -53,6 +55,8 @@ def build_engine( profiles=[p], preview_features=preview_features, load_timing_cache=timing_cache, + int8=int8, + builder_optimization_level=builder_optimization_level, **config_kwargs, ), save_timing_cache=timing_cache, diff --git a/tutorials/multimodal/SDXL Quantization.ipynb b/tutorials/multimodal/SDXL Quantization.ipynb new file mode 100644 index 000000000000..1562a9c756ee --- /dev/null +++ b/tutorials/multimodal/SDXL Quantization.ipynb @@ -0,0 +1,851 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b32d3842", + "metadata": {}, + "source": [ + "# SDXL Int8 Quantization Solution by Ammo\n", + "\n", + "### Note:\n", + "This notebook requires nvidia-ammo > 0.9.x, which comes with NeMo framework container > 23.05. An example command to launch the container:\n", + "\n", + "```\n", + "docker run --gpus all -it --rm -v :/opt/NeMo --shm-size=8g \\\n", + " -p 8888:8888 --ulimit memlock=-1 --ulimit \\\n", + " stack=67108864 \n", + "```\n", + "\n", + "This tutorial shows how to use Ammo to calibrate and quantize the UNet part of the SDXL within NeMo framework. \n", + "\n", + "Please note that NeMo provides users with an end-to-end training framework for SDXL, and this quantization pipeline is supposed to work with a `.nemo` checkpoint trained from their own text-image dataset. In this tutorial, a open-source checkpoint is converted to `.nemo` format for illustration purpose." 
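Conceptually, the calibration pass runs the normal SDXL sampling loop for `quantize.n_steps` steps over `quantize.calib_size` prompts, collects per-tensor `amax` statistics (see the `percentile` comment in `sd_xl_quantize.yaml`), and turns them into int8 scales. The snippet below is only a minimal NumPy sketch of that amax-to-scale step, assuming symmetric per-tensor int8 for illustration; it is not the exact scheme Ammo applies inside the UNet.

```python
import numpy as np

def int8_quantize(x: np.ndarray, amax: float):
    """Symmetric int8 quantization with scale = amax / 127, so that x ~ q * scale."""
    scale = amax / 127.0
    q = np.clip(np.round(x / scale), -127, 127).astype(np.int8)
    return q, scale

# Stand-in for activations observed during one calibration step.
activations = np.random.randn(2, 1280).astype(np.float32)
amax = float(np.abs(activations).max())       # calibration tracks this statistic across steps
q, scale = int8_quantize(activations, amax)
dequantized = q.astype(np.float32) * scale    # what the int8 engine effectively computes with
print("max abs quantization error:", float(np.abs(dequantized - activations).max()))
```

In the real pipeline these scales are baked into the quantized UNet checkpoint (`quantize.quantized_ckpt`), which is then exported to ONNX and compiled into the int8 TensorRT engine (`trt_export.trt_engine`).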
+ ] + }, + { + "cell_type": "markdown", + "id": "2f8320ca", + "metadata": {}, + "source": [ + "### Download SDXL checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd436eab", + "metadata": {}, + "outputs": [], + "source": [ + "## Download Unet checkpoint\n", + "! mkdir -p /sdxl_ckpts/stable-diffusion-xl-base-1.0/unet && wget -P /sdxl_ckpts/stable-diffusion-xl-base-1.0/unet https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/unet/diffusion_pytorch_model.safetensors\n", + "## Download Vae checkpoint \n", + "! mkdir -p /sdxl_ckpts/stable-diffusion-xl-base-1.0/vae && wget -P /sdxl_ckpts/stable-diffusion-xl-base-1.0/vae https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/vae/diffusion_pytorch_model.safetensors" + ] + }, + { + "cell_type": "markdown", + "id": "70164e82", + "metadata": {}, + "source": [ + "### Convert downloaded checkpoint into `.nemo` format" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c9649553", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FlashAttention Installed\n", + "[NeMo I 2024-04-24 22:13:11 distributed:42] Initializing torch.distributed with local_rank: 0, rank: 0, world_size: 1\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1183] hidden_size not found in {'precision': 'bf16-mixed', 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': True, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': '/sdxl_ckpts/stable-diffusion-xl-base-1.0/unet/diffusion_pytorch_model.safetensors'}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': '/sdxl_ckpts/stable-diffusion-xl-base-1.0/vae/diffusion_pytorch_model.safetensors', 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 
'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0'}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[rank0]:[W init.cpp:767] Warning: nvfuser is no longer supported in torch script, use _jit_set_nvfuser_enabled is deprecated and a no-op (function operator())\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:253] Rank 0 has data parallel group : [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:259] Rank 0 has combined group of data parallel and context parallel : [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:264] All data parallel group ranks with context parallel combined: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:267] Ranks 0 has data parallel rank: 0\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:284] Rank 0 has context parallel group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:287] All context parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:288] Ranks 0 has context parallel rank: 0\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:299] Rank 0 has model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:300] All model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:310] Rank 0 has tensor model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:314] All tensor model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:315] Rank 0 has tensor model parallel rank: 0\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:344] Rank 0 has pipeline model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:356] Rank 0 has embedding group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:362] All pipeline model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:363] Rank 0 has pipeline model parallel rank 0\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:364] All embedding group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:365] Rank 0 has embedding rank: 0\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1183] hidden_size not found in {'precision': 'bf16-mixed', 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': True, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': '/sdxl_ckpts/stable-diffusion-xl-base-1.0/unet/diffusion_pytorch_model.safetensors'}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': '/sdxl_ckpts/stable-diffusion-xl-base-1.0/vae/diffusion_pytorch_model.safetensors', 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 
'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0'}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[NeMo I 2024-04-24 22:13:12 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:12 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:12 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:12 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:12 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:12 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:14 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:14 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:16 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:16 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:18 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:18 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. 
Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:20 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:20 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:21 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:21 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:23 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:23 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:24 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:24 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:24 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:24 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:25 utils:108] Getting module=, cls=\n", + "open_clip_pytorch_model.bin: 100%|██████████| 10.2G/10.2G [01:36<00:00, 106MB/s]\n", + "Initialized embedder #0: FrozenCLIPEmbedder with 123060480 params. Trainable: False\n", + "Initialized embedder #1: FrozenOpenCLIPEmbedder2 with 694659841 params. Trainable: False\n", + "Initialized embedder #2: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #3: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #4: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "Working with z of shape (1, 4, 32, 32) = 4096 dimensions.\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "AutoencoderKLInferenceWrapper: Following keys are missing during loading VAE weights, which may lead to compromised image quality for a resumed training. 
Please check the checkpoint you provided.\n", + "Missing:['encoder.mid.attn_1.proj_out.bias', 'decoder.mid.attn_1.v.weight', 'encoder.mid.attn_1.proj_out.weight', 'decoder.mid.attn_1.proj_out.bias', 'decoder.mid.attn_1.q.weight', 'decoder.mid.attn_1.q.bias', 'encoder.mid.attn_1.q.weight', 'encoder.mid.attn_1.k.weight', 'encoder.mid.attn_1.v.bias', 'decoder.mid.attn_1.k.weight', 'decoder.mid.attn_1.v.bias', 'decoder.mid.attn_1.proj_out.weight', 'encoder.mid.attn_1.q.bias', 'encoder.mid.attn_1.v.weight', 'encoder.mid.attn_1.k.bias', 'decoder.mid.attn_1.k.bias']\n", + "Unexpected:['encoder.mid.attentions.0.to_k.weight', 'decoder.mid.attentions.0.to_out.0.weight', 'encoder.mid.attentions.0.to_v.bias', 'decoder.mid.attentions.0.to_q.bias', 'encoder.mid.attentions.0.to_q.weight', 'encoder.mid.attentions.0.to_v.weight', 'decoder.mid.attentions.0.to_k.weight', 'decoder.mid.attentions.0.to_v.bias', 'encoder.mid.attentions.0.to_k.bias', 'encoder.mid.attentions.0.to_out.0.bias', 'decoder.mid.attentions.0.to_out.0.bias', 'encoder.mid.attentions.0.to_out.0.weight', 'decoder.mid.attentions.0.to_k.bias', 'decoder.mid.attentions.0.to_v.weight', 'decoder.mid.attentions.0.to_q.weight', 'encoder.mid.attentions.0.to_q.bias']\n", + "[NeMo I 2024-04-24 22:15:42 convert_hf_ckpt_to_nemo:226] NeMo model saved to: /quantization/sdxl_base.nemo\n" + ] + } + ], + "source": [ + "WORKDIR = '/quantization'\n", + "! torchrun /opt/NeMo/examples/multimodal/text_to_image/convert_hf_ckpt_to_nemo.py \\\n", + " --model_type sdxl \\\n", + " --ckpt_path /sdxl_ckpts/stable-diffusion-xl-base-1.0/unet/diffusion_pytorch_model.safetensors \\\n", + " --hparams_file /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml \\\n", + " --nemo_file_path $WORKDIR/sdxl_base.nemo" + ] + }, + { + "cell_type": "markdown", + "id": "25817b85", + "metadata": {}, + "source": [ + "### Run quantization script with default config, and finally the script will export the quantized unet to onnx file.\n", + "\n", + "##### Quantization config\n", + "\n", + "```yaml\n", + "quantize\n", + " exp_name: nemo_test\n", + " n_steps: 20 # number of inference steps\n", + " format: 'int8' # only int8 quantization is supported now\n", + " percentile: 1.0 # Control quantization scaling factors (amax) collecting range, meaning that we will collect the minimum amax in the range of `(n_steps * percentile)` steps. Recommendation: 1.0\n", + " batch_size: 1 # batch size calling sdxl inference pipeline during calibration\n", + " calib_size: 32 # For SDXL, we recommend 32, 64 or 128\n", + " quant_level: 2.5 #Which layers to be quantized, 1: `CNNs`, 2: `CNN + FFN`, 2.5: `CNN + FFN + QKV`, 3: `CNN + Linear`. Recommendation: 2, 2.5 and 3, depending on the requirements for image quality & speedup.\n", + " alpha: 0.8 # A parameter in SmoothQuant, used for linear layers only. 
Recommendation: 0.8 for SDXL\n", + "```\n", + "\n", + "##### Onnx export config\n", + "\n", + "```yaml\n", + "onnx_export:\n", + " onnx_dir: nemo_onnx # Path to save onnx files\n", + " pretrained_base: ${model.restore_from_path} # Path to nemo checkpoint for sdxl\n", + " quantized_ckpt: nemo.unet.state_dict.${quantize.exp_name}.pt # Path to save quantized unet checkpoint\n", + " format: int8\n", + "```\n", + "##### Onnx export config\n", + "\n", + "```yaml\n", + "trt_export:\n", + " static_batch: False # static batch engines have better latency\n", + " min_batch_size: 1 # minimum batch size when using dynamic batch, has to be the same with max_batch_size and infer.num_samples when using static batch\n", + " max_batch_size: 1 # maximum batch size when using dynamic batch, has to be the same with min_batch_size and infer.num_samples when using static batch\n", + " int8: True # Allow engine builder recognize int8 precision\n", + " builder_optimization_level: 4 # set to 1-5, higher optimization level means better latency but longer compiling time\n", + " trt_engine: int8_unet_xl.plan # path to save trt engine\n", + "```\n", + "\n", + "The following command restores a pre-trained sdxl model from `$WORKDIR/sdxl_base.nemo` derived from the above step.\n", + "The quantized U-Net checkpoint is saved to `quantize.quantized_ckpt`, converted onnx file is saved to `onnx_export.onnx_dir` and trt engine is saved to `trt_export.trt_engine`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d955f6c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FlashAttention Installed\n", + "[NeMo W 2024-04-24 19:42:59 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", + " See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", + " ret = run_job(\n", + " \n", + "[NeMo W 2024-04-24 19:42:59 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!\n", + " \n", + "Using 16bit Automatic Mixed Precision (AMP)\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1183] hidden_size not found in {'precision': 16, 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': False, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': None}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': None, 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 
'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0', 'ckpt_path': None}. 
Set this in model_parallel_config if using pipeline parallelism.\n", + "[W init.cpp:767] Warning: nvfuser is no longer supported in torch script, use _jit_set_nvfuser_enabled is deprecated and a no-op (function operator())\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:253] Rank 0 has data parallel group : [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:259] Rank 0 has combined group of data parallel and context parallel : [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:264] All data parallel group ranks with context parallel combined: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:267] Ranks 0 has data parallel rank: 0\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:284] Rank 0 has context parallel group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:287] All context parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:288] Ranks 0 has context parallel rank: 0\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:299] Rank 0 has model parallel group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:300] All model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:310] Rank 0 has tensor model parallel group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:314] All tensor model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:315] Rank 0 has tensor model parallel rank: 0\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:344] Rank 0 has pipeline model parallel group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:356] Rank 0 has embedding group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:362] All pipeline model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:363] Rank 0 has pipeline model parallel rank 0\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:364] All embedding group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:365] Rank 0 has embedding rank: 0\n", + "24-04-24 19:43:09 - PID:1361 - rank:(0, 0, 0, 0) - microbatches.py:39 - INFO - setting number of micro-batches to constant 1\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1183] hidden_size not found in {'precision': 16, 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': False, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': None}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': None, 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 
'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0', 'ckpt_path': None}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[NeMo I 2024-04-24 19:43:09 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:09 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:09 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:09 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:10 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:10 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:11 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:11 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. 
Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:13 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:13 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:15 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:15 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:17 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:17 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:19 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:19 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:20 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:20 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:21 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:21 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:21 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:21 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:21 utils:108] Getting module=, cls=\n", + "Loaded ViT-bigG-14 model config.\n", + "Loading pretrained ViT-bigG-14 weights (laion2b_s39b_b160k).\n", + "Initialized embedder #0: FrozenCLIPEmbedder with 123060480 params. Trainable: False\n", + "Initialized embedder #1: FrozenOpenCLIPEmbedder2 with 694659841 params. Trainable: False\n", + "Initialized embedder #2: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #3: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #4: ConcatTimestepEmbedderND with 0 params. 
Trainable: False\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "Working with z of shape (1, 4, 32, 32) = 4096 dimensions.\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "[NeMo I 2024-04-24 19:43:53 nlp_overrides:1155] Model MegatronDiffusionEngine was successfully restored from /quantization/sdxl_base.nemo.\n", + "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "----------------------------------------------------------------------------------------------------\n", + "distributed_backend=nccl\n", + "All distributed processes registered. Starting with 1 processes\n", + "----------------------------------------------------------------------------------------------------\n", + "\n", + "Building TensorRT engine for /quantization/nemo_onnx/unet.onnx: /quantization/int8_unet_xl.plan\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {x [min=(1, 4, 128, 128), opt=(4, 4, 128, 128), max=(8, 4, 128, 128)],\n", + " y [min=(1, 2816), opt=(4, 2816), max=(8, 2816)],\n", + " timesteps [min=(1,), opt=(4,), max=(8,)],\n", + " context [min=(1, 80, 2048), opt=(4, 80, 2048), max=(8, 80, 2048)]}\n", + " ]\n", + "\u001B[38;5;11m[W] It looks like some layers in the network have compute precision set, but precision constraints were not enabled. \n", + " Precision constraints must be set to 'prefer' or 'obey' for layer compute precision to take effect. \n", + " Note: Layers and their requested precisions were: {'/input_blocks.0/input_blocks.0.0/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.0/input_blocks.0.0/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.0/input_blocks.0.0/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.0/input_blocks.0.0/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.1/input_blocks.1.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.1/input_blocks.1.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.1/input_blocks.1.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.1/input_blocks.1.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.1/input_blocks.1.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.1/input_blocks.1.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.1/input_blocks.1.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.1/input_blocks.1.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.2/input_blocks.2.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.2/input_blocks.2.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.2/input_blocks.2.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.2/input_blocks.2.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.2/input_blocks.2.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.2/input_blocks.2.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.2/input_blocks.2.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.2/input_blocks.2.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.3/input_blocks.3.0/op/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.3/input_blocks.3.0/op/input_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.3/input_blocks.3.0/op/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.3/input_blocks.3.0/op/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.4/input_blocks.4.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 
'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.5/input_blocks.5.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.6/input_blocks.6.0/op/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.6/input_blocks.6.0/op/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.6/input_blocks.6.0/op/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.6/input_blocks.6.0/op/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.7/input_blocks.7.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/input_blocks.7/input_blocks.7.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.2/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.2/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.2/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.2/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.2/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.2/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.2/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.2/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', 
... [output truncated for brevity: the per-node precision map repeats the same pattern for every quantized layer of the UNet — each */input_quantizer/QuantizeLinear and */weight_quantizer/QuantizeLinear node is assigned 'FLOAT', and every corresponding DequantizeLinear node is assigned 'INT8'] ... 
'/output_blocks.6/output_blocks.6.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.6/output_blocks.6.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.6/output_blocks.6.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/out/out.1/input_quantizer/QuantizeLinear': 'FLOAT', '/out/out.1/input_quantizer/DequantizeLinear': 'INT8', '/out/out.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/out/out.1/weight_quantizer/DequantizeLinear': 'INT8'}\u001B[0m\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | [FP16, INT8]\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 881.973 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/int8_unet_xl.plan\n" + ] + } + ], + "source": [ + "! 
torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py model.restore_from_path=$WORKDIR/sdxl_base.nemo onnx_export.onnx_dir=$WORKDIR/nemo_onnx quantize.quantized_ckpt=$WORKDIR/nemo.unet.state_dict.nemo.pt trt_export.trt_engine=$WORKDIR/int8_unet_xl.plan\n" + ] + }, + { + "cell_type": "markdown", + "id": "f97d6bfa", + "metadata": {}, + "source": [ + "### Build the end-to-end TRT inference pipeline\n", + "To run end-to-end inference with the quantized U-Net engine, we also need to export and build engines for the other components of SDXL: the VAE and the two CLIP text encoders. The following script restores SDXL from the `.nemo` checkpoint and saves the corresponding engine files to `infer.out_path`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2e8b7742", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FlashAttention Installed\n", + "[NeMo W 2024-04-24 22:17:42 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", + " See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", + " ret = run_job(\n", + " \n", + "[NeMo W 2024-04-24 22:17:42 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!\n", + " \n", + "Using 16bit Automatic Mixed Precision (AMP)\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1183] hidden_size not found in {'precision': 16, 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': False, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': None}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': None, 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 
'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0', 'ckpt_path': None}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[W init.cpp:767] Warning: nvfuser is no longer supported in torch script, use _jit_set_nvfuser_enabled is deprecated and a no-op (function operator())\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:253] Rank 0 has data parallel group : [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:259] Rank 0 has combined group of data parallel and context parallel : [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:264] All data parallel group ranks with context parallel combined: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:267] Ranks 0 has data parallel rank: 0\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:284] Rank 0 has context parallel group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:287] All context parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:288] Ranks 0 has context parallel rank: 0\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:299] Rank 0 has model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:300] All model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:310] Rank 0 has tensor model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:314] All tensor model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:315] Rank 0 has tensor model parallel rank: 0\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:344] Rank 0 has pipeline model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:356] Rank 0 has embedding group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:362] All pipeline model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:363] Rank 0 has pipeline model parallel rank 0\n", + "[NeMo I 2024-04-24 22:17:50 
megatron_init:364] All embedding group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:365] Rank 0 has embedding rank: 0\n", + "24-04-24 22:17:50 - PID:703 - rank:(0, 0, 0, 0) - microbatches.py:39 - INFO - setting number of micro-batches to constant 1\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1183] hidden_size not found in {'precision': 16, 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': False, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': None}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': None, 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 
'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0', 'ckpt_path': None}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[NeMo I 2024-04-24 22:17:50 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:17:50 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:50 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:17:50 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:51 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:51 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:53 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:53 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:54 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:54 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:56 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:56 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. 
Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:58 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:58 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:00 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:18:00 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:02 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:18:02 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:02 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:18:02 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:02 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:18:02 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:02 utils:108] Getting module=, cls=\n", + "Loaded ViT-bigG-14 model config.\n", + "Loading pretrained ViT-bigG-14 weights (laion2b_s39b_b160k).\n", + "Initialized embedder #0: FrozenCLIPEmbedder with 123060480 params. Trainable: False\n", + "Initialized embedder #1: FrozenOpenCLIPEmbedder2 with 694659841 params. Trainable: False\n", + "Initialized embedder #2: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #3: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #4: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "Working with z of shape (1, 4, 32, 32) = 4096 dimensions.\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "[NeMo I 2024-04-24 22:18:35 nlp_overrides:1155] Model MegatronDiffusionEngine was successfully restored from /quantization/sdxl_base.nemo.\n", + "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "----------------------------------------------------------------------------------------------------\n", + "distributed_backend=nccl\n", + "All distributed processes registered. Starting with 1 processes\n", + "----------------------------------------------------------------------------------------------------\n", + "\n", + "[NeMo W 2024-04-24 22:18:36 nemo_logging:349] /opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py:1184: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " assert y.shape[0] == x.shape[0]\n", + " \n", + "[NeMo W 2024-04-24 22:18:36 nemo_logging:349] /opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py:209: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " assert x.shape[1] == self.channels\n", + " \n", + "[NeMo W 2024-04-24 22:18:37 nemo_logging:349] /opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " assert x.shape[1] == self.channels\n", + " \n", + "[NeMo W 2024-04-24 22:22:17 nemo_logging:349] /opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py:172: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " w_ = w_ * (int(c) ** (-0.5))\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/torch/onnx/utils.py:2095: UserWarning: Provided key z_pooled for dynamic axes is not a valid input/output name\n", + " warnings.warn(\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/modeling_attn_mask_utils.py:86: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if input_shape[-1] > 1 or self.sliding_window is not None:\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/modeling_attn_mask_utils.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if past_key_values_length > 0:\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/models/clip/modeling_clip.py:281: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/models/clip/modeling_clip.py:289: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/models/clip/modeling_clip.py:321: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):\n", + " \n", + "[NeMo W 2024-04-24 22:22:27 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/torch/onnx/symbolic_opset9.py:5859: UserWarning: Exporting aten::index operator of advanced indexing in opset 14 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n", + " warnings.warn(\n", + " \n", + "Building TensorRT engine for /quantization/onnx/unet_xl/unet_xl.onnx: /quantization/plan/unet_xl.plan\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {x [min=(1, 4, 128, 128), opt=(2, 4, 128, 128), max=(8, 4, 128, 128)],\n", + " y [min=(1, 2816), opt=(2, 2816), max=(8, 2816)],\n", + " timesteps [min=(1,), opt=(2,), max=(8,)],\n", + " context [min=(1, 80, 2048), opt=(2, 80, 2048), max=(8, 80, 2048)]}\n", + " ]\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | [FP16]\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;11m[W] Detected layernorm nodes in FP16.\u001B[0m\n", + "\u001B[38;5;11m[W] Running layernorm after self-attention in FP16 may cause overflow. 
Exporting the model to the latest available ONNX opset (later than opset 17) to use the INormalizationLayer, or forcing layernorm layers to run in FP32 precision can help with preserving accuracy.\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 553.937 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/plan/unet_xl.plan\n", + "Building TensorRT engine for /quantization/onnx/vae/vae.onnx: /quantization/plan/vae.plan\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {z [min=(1, 4, 128, 128), opt=(2, 4, 128, 128), max=(8, 4, 128, 128)]}\n", + " ]\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | []\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 266.743 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/plan/vae.plan\n", + "Building TensorRT engine for /quantization/onnx/clip1/clip1.onnx: /quantization/plan/clip1.plan\n", + "\u001B[38;5;11m[W] ModelImporter.cpp:409: Make sure input input_ids has Int64 binding.\u001B[0m\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {input_ids [min=(1, 77), opt=(2, 77), max=(8, 77)]}\n", + " ]\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | [FP16]\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 16.988 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/plan/clip1.plan\n", + "Building TensorRT engine for /quantization/onnx/clip2/clip2.onnx: /quantization/plan/clip2.plan\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {input_ids [min=(1, 77), opt=(2, 77), max=(8, 77)]}\n", + " ]\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | [FP16]\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 72.535 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/plan/clip2.plan\n" + ] + } + ], + "source": [ + "! 
torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_export.py model.restore_from_path=$WORKDIR/sdxl_base.nemo infer.out_path=$WORKDIR" + ] + }, + { + "cell_type": "markdown", + "id": "e7eb2d03", + "metadata": {}, + "source": [ + "### Run TRT inference pipeline with original engines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25737be2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FlashAttention Installed\n", + "[NeMo W 2024-04-24 22:46:11 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", + " See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", + " ret = run_job(\n", + " \n", + "Loading TensorRT engine: /quantization/plan/unet_xl.plan\n", + "[I] Loading bytes from /quantization/plan/unet_xl.plan\n", + "unet_xl trt engine loaded successfully\n", + "Loading TensorRT engine: /quantization/plan/vae.plan\n", + "[I] Loading bytes from /quantization/plan/vae.plan\n", + "vae trt engine loaded successfully\n", + "Loading TensorRT engine: /quantization/plan/clip1.plan\n", + "[I] Loading bytes from /quantization/plan/clip1.plan\n", + "clip1 trt engine loaded successfully\n", + "Loading TensorRT engine: /quantization/plan/clip2.plan\n", + "[I] Loading bytes from /quantization/plan/clip2.plan\n", + "clip2 trt engine loaded successfully\n", + "[NeMo I 2024-04-24 22:46:17 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:46:17 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:46:17 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler for 41 steps: 98%|▉| 40/41 [00:24<00:00, 1.60it/\n", + "This batch takes 27.204587490297854s\n", + "[NeMo I 2024-04-24 22:46:45 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:46:45 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:46:45 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler for 41 steps: 98%|▉| 40/41 [00:25<00:00, 1.57it/\n", + "This batch takes 25.58329666685313s\n", + "[NeMo I 2024-04-24 22:47:14 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:47:14 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:47:14 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler for 41 steps: 98%|▉| 40/41 [00:25<00:00, 1.55it/\n", + "This batch takes 25.87396944500506s\n", + "[NeMo I 2024-04-24 22:47:44 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:47:44 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:47:44 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler 
for 41 steps: 98%|▉| 40/41 [00:25<00:00, 1.54it/\n", + "This batch takes 26.03419069480151s\n", + "[NeMo I 2024-04-24 22:48:13 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:48:13 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:48:13 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler for 41 steps: 71%|▋| 29/41 [00:18<00:07, 1.52it/" + ] + } + ], + "source": [ + "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py \\\n", + " out_path=$WORKDIR/trt_output_fp16 \\\n", + " unet_xl=$WORKDIR/plan/unet_xl.plan \\\n", + " vae=$WORKDIR/plan/vae.plan \\\n", + " clip1=$WORKDIR/plan/clip1.plan \\\n", + " clip2=$WORKDIR/plan/clip2.plan\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "d11bbe7d", + "metadata": {}, + "source": [ + "### Run TRT inference pipeline with quantized U-Net engine" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3f2263b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "^C\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3109, in _dep_map\n", + "[2024-04-24 19:42:46,104] torch.distributed.elastic.agent.server.api: [WARNING] Received Signals.SIGINT death signal, shutting down workers\n", + "[2024-04-24 19:42:46,104] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 1300 closing signal SIGINT\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 2902, in __getattr__\n", + " raise AttributeError(attr)\n", + "AttributeError: _DistInfoDistribution__dep_map\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py\", line 25, in \n", + " from nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser import DiscreteDenoiser\n", + " File \"/opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/denoiser.py\", line 17, in \n", + " from nemo.collections.multimodal.parts.stable_diffusion.utils import append_dims, instantiate_from_config\n", + " File \"/opt/NeMo/nemo/collections/multimodal/parts/stable_diffusion/utils.py\", line 25, in \n", + " from nemo.utils import logging\n", + " File \"/opt/NeMo/nemo/utils/__init__.py\", line 31, in \n", + " from nemo.utils.lightning_logger_patch import add_memory_handlers_to_pl_logger\n", + " File \"/opt/NeMo/nemo/utils/lightning_logger_patch.py\", line 18, in \n", + " import pytorch_lightning as pl\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/__init__.py\", line 27, in \n", + " from pytorch_lightning.callbacks import Callback # noqa: E402\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/__init__.py\", line 14, in \n", + " from pytorch_lightning.callbacks.batch_size_finder import BatchSizeFinder\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/batch_size_finder.py\", line 26, in \n", + " from pytorch_lightning.callbacks.callback import Callback\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/callback.py\", line 22, 
in \n", + " from pytorch_lightning.utilities.types import STEP_OUTPUT\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/utilities/types.py\", line 41, in \n", + " from torchmetrics import Metric\n", + " File \"/usr/local/lib/python3.10/dist-packages/torchmetrics/__init__.py\", line 22, in \n", + " from torchmetrics import functional # noqa: E402\n", + " File \"/usr/local/lib/python3.10/dist-packages/torchmetrics/functional/__init__.py\", line 121, in \n", + " from torchmetrics.functional.text._deprecated import _bleu_score as bleu_score\n", + " File \"/usr/local/lib/python3.10/dist-packages/torchmetrics/functional/text/__init__.py\", line 49, in \n", + " if _TRANSFORMERS_GREATER_EQUAL_4_4:\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/imports.py\", line 164, in __bool__\n", + " self._check_available()\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/imports.py\", line 158, in _check_available\n", + " self._check_requirement()\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/imports.py\", line 132, in _check_requirement\n", + " pkg_resources.require(self.requirement)\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 968, in require\n", + " needed = self.resolve(parse_requirements(requirements))\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 834, in resolve\n", + " new_requirements = dist.requires(req.extras)[::-1]\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 2822, in requires\n", + " dm = self._dep_map\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3111, in _dep_map\n", + " self.__dep_map = self._compute_dependencies()\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3133, in _compute_dependencies\n", + " dm[s_extra] = [r for r in reqs_for_extra(extra) if r not in common]\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3133, in \n", + " dm[s_extra] = [r for r in reqs_for_extra(extra) if r not in common]\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3125, in reqs_for_extra\n", + " if not req.marker or req.marker.evaluate({'extra': extra}):\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/_vendor/packaging/markers.py\", line 252, in evaluate\n", + " return _evaluate_markers(self._markers, current_environment)\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/_vendor/packaging/markers.py\", line 164, in _evaluate_markers\n", + " return any(all(item) for item in groups)\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/_vendor/packaging/markers.py\", line 164, in \n", + " return any(all(item) for item in groups)\n", + "KeyboardInterrupt\n" + ] + } + ], + "source": [ + "! 
torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py \\\n", + " out_path=$WORKDIR/trt_output_int8 \\\n", + " unet_xl=$WORKDIR/int8_unet_xl.plan \\\n", + " vae=$WORKDIR/plan/vae.plan \\\n", + " clip1=$WORKDIR/plan/clip1.plan \\\n", + " clip2=$WORKDIR/plan/clip2.plan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c48c21dd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From a9dac0edabba2297fd6e1d521cb297b1318a8df8 Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Tue, 30 Apr 2024 11:10:25 -0700 Subject: [PATCH 005/178] Fix #8948, allow preprocessor to be stream captured to a cuda graph when doing per_feature normalization (#8964) * Do feature normalization in parallel, rather than via a for loop. At large batch sizes, this becomes a bottleneck, taking about 9 ms at batch size 16, for example. See issue #8948. Signed-off-by: Daniel Galvez * Remove all instances of cudaStreamSynchronize() in the featurizer when doing "per_feature" normalization. With this change, we can now do stream capture to a cuda graph on the preprocessor. This is bound to increase performance significantly. Even at batch size 16, the GPU is idle about 50% of the time because these kernels finish so fast. Signed-off-by: Daniel Galvez * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix crash in CPU mode. Signed-off-by: Daniel Galvez * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Daniel Galvez Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../asr/parts/preprocessing/features.py | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/features.py b/nemo/collections/asr/parts/preprocessing/features.py index 67813f3e66d2..dccc81b1816c 100644 --- a/nemo/collections/asr/parts/preprocessing/features.py +++ b/nemo/collections/asr/parts/preprocessing/features.py @@ -60,17 +60,33 @@ def normalize_batch(x, seq_len, normalize_type): x_mean = None x_std = None if normalize_type == "per_feature": - x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) - x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) - for i in range(x.shape[0]): - if x[i, :, : seq_len[i]].shape[1] == 1: - raise ValueError( - "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result " - "in torch.std() returning nan. Make sure your audio length has enough samples for a single " - "feature (ex. at least `hop_length` for Mel Spectrograms)." - ) - x_mean[i, :] = x[i, :, : seq_len[i]].mean(dim=1) - x_std[i, :] = x[i, :, : seq_len[i]].std(dim=1) + batch_size = x.shape[0] + max_time = x.shape[2] + + # When doing stream capture to a graph, item() is not allowed + # becuase it calls cudaStreamSynchronize(). Therefore, we are + # sacrificing some error checking when running with cuda graphs. 
+ if ( + torch.cuda.is_available() + and not torch.cuda.is_current_stream_capturing() + and torch.any(seq_len == 1).item() + ): + raise ValueError( + "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result " + "in torch.std() returning nan. Make sure your audio length has enough samples for a single " + "feature (ex. at least `hop_length` for Mel Spectrograms)." + ) + time_steps = torch.arange(max_time, device=x.device).unsqueeze(0).expand(batch_size, max_time) + valid_mask = time_steps < seq_len.unsqueeze(1) + x_mean_numerator = torch.where(valid_mask.unsqueeze(1), x, 0.0).sum(axis=2) + x_mean_denominator = valid_mask.sum(axis=1) + x_mean = x_mean_numerator / x_mean_denominator.unsqueeze(1) + + # Subtract 1 in the denominator to correct for the bias. + x_std = torch.sqrt( + torch.sum(torch.where(valid_mask.unsqueeze(1), x - x_mean.unsqueeze(2), 0.0) ** 2, axis=2) + / (x_mean_denominator.unsqueeze(1) - 1.0) + ) # make sure x_std is not zero x_std += CONSTANT return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2), x_mean, x_std @@ -457,7 +473,7 @@ def forward(self, x, seq_len, linear_spec=False): # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) max_len = x.size(-1) - mask = torch.arange(max_len).to(x.device) + mask = torch.arange(max_len, device=x.device) mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1) x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value) del mask From fe4b291175bdc2bc2c1f3b919ddd16e1233e9252 Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:08:15 -0700 Subject: [PATCH 006/178] [ASR] Support for transcription of multi-channel audio for AED models (#9007) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Propagate channel selector for AED model + add channel selector to get_lhotse_dataloader_from config Signed-off-by: Ante Jukić * Included comments Signed-off-by: Ante Jukić * Added unit test Signed-off-by: Ante Jukić --------- Signed-off-by: Ante Jukić --- .../asr/models/aed_multitask_models.py | 1 + .../common/data/lhotse/dataloader.py | 28 +++++ .../common/test_lhotse_dataloading.py | 100 ++++++++++++++++++ 3 files changed, 129 insertions(+) diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 7e20d7a16559..f9413a4dd738 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -875,6 +875,7 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo 'drop_last': False, 'text_field': config.get('text_field', 'answer'), 'lang_field': config.get('lang_field', 'target_lang'), + 'channel_selector': config.get('channel_selector', None), } temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config), inference=True) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 5bb3bf2988ea..eabc3da5d11b 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -89,6 +89,7 @@ class LhotseDataLoadingConfig: seed: int | str = 0 num_workers: int = 0 pin_memory: bool = False + channel_selector: int | str | None = None # 4. Optional Lhotse data augmentation. # a. On-the-fly noise/audio mixing. 
@@ -156,6 +157,11 @@ def get_lhotse_dataloader_from_config( # 1. Load a manifest as a Lhotse CutSet. cuts, is_tarred = read_cutset_from_config(config) + # Apply channel selector + if config.channel_selector is not None: + logging.info('Using channel selector %s.', config.channel_selector) + cuts = cuts.map(partial(_select_channel, channel_selector=config.channel_selector)) + # Resample as a safeguard; it's a no-op when SR is already OK cuts = cuts.resample(config.sample_rate) @@ -443,3 +449,25 @@ def _flatten_alt_text(cut) -> list: text_instance.custom = {"text": data.pop("text"), "lang": data.pop("lang"), **data} ans.append(text_instance) return ans + + +def _select_channel(cut, channel_selector: int | str) -> list: + if isinstance(channel_selector, int): + channel_idx = channel_selector + elif isinstance(channel_selector, str): + if channel_selector in cut.custom: + channel_idx = cut.custom[channel_selector] + else: + raise ValueError(f"Channel selector {channel_selector} not found in cut.custom") + + if channel_idx >= cut.num_channels: + raise ValueError( + f"Channel index {channel_idx} is larger than the actual number of channels {cut.num_channels}" + ) + + if cut.num_channels == 1: + # one channel available and channel_idx==0 + return cut + else: + # with_channels only defined on MultiCut + return cut.with_channels(channel_idx) diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index d4b3ad03050e..8eaebb2af68a 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -104,6 +104,51 @@ def nemo_manifest_path(cutset_path: Path): return p +@pytest.fixture(scope="session") +def mc_cutset_path(tmp_path_factory) -> Path: + """10 two-channel utterances of length 1s as a Lhotse CutSet.""" + from lhotse import CutSet, MultiCut + from lhotse.testing.dummies import DummyManifest + + num_examples = 10 # number of examples + num_channels = 2 # number of channels per example + + # create a dummy manifest with single-channel examples + sc_cuts = DummyManifest(CutSet, begin_id=0, end_id=num_examples * num_channels, with_data=True) + mc_cuts = [] + + for n in range(num_examples): + # sources for individual channels + mc_sources = [] + for channel in range(num_channels): + source = sc_cuts[n * num_channels + channel].recording.sources[0] + source.channels = [channel] + mc_sources.append(source) + + # merge recordings + rec = Recording( + sources=mc_sources, + id=f'mc-dummy-recording-{n:02d}', + num_samples=sc_cuts[0].num_samples, + duration=sc_cuts[0].duration, + sampling_rate=sc_cuts[0].sampling_rate, + ) + + # multi-channel cut + cut = MultiCut( + recording=rec, id=f'mc-dummy-cut-{n:02d}', start=0, duration=1.0, channel=list(range(num_channels)) + ) + mc_cuts.append(cut) + + mc_cuts = CutSet.from_cuts(mc_cuts) + + tmp_path = tmp_path_factory.mktemp("data") + p = tmp_path / "mc_cuts.jsonl.gz" + pa = tmp_path / "mc_audio" + mc_cuts.save_audios(pa).to_file(p) + return p + + @pytest.fixture(scope="session") def nemo_tarred_manifest_path(nemo_manifest_path: Path) -> Tuple[str, str]: """10 utterances of length 1s as a NeMo tarred manifest.""" @@ -247,6 +292,61 @@ def test_dataloader_from_lhotse_cuts_cut_into_windows(cutset_path: Path): # exactly 20 cuts were used because we cut 10x 1s cuts into 20x 0.5s cuts +def test_dataloader_from_lhotse_cuts_channel_selector(mc_cutset_path: Path): + # Dataloader without channel selector + config = OmegaConf.create( + { + "cuts_path": 
mc_cutset_path, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + batches = [b for b in dl] + assert len(batches) == 3 + + # 1.0s = 16000 samples, two channels, note the constant duration and batch size + assert batches[0]["audio"].shape == (4, 2, 16000) + assert batches[1]["audio"].shape == (4, 2, 16000) + assert batches[2]["audio"].shape == (2, 2, 16000) + # exactly 10 cuts were used + + # Apply channel selector + for channel_selector in [None, 0, 1]: + + config_cs = OmegaConf.create( + { + "cuts_path": mc_cutset_path, + "channel_selector": channel_selector, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + } + ) + + dl_cs = get_lhotse_dataloader_from_config( + config=config_cs, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + + for n, b_cs in enumerate(dl_cs): + if channel_selector is None: + # no channel selector, needs to match the original dataset + assert torch.equal(b_cs["audio"], batches[n]["audio"]) + else: + # channel selector, needs to match the selected channel + assert torch.equal(b_cs["audio"], batches[n]["audio"][:, channel_selector, :]) + + @requires_torchaudio def test_dataloader_from_lhotse_shar_cuts(cutset_shar_path: Path): config = OmegaConf.create( From 33494f566e07f4387a35cac06461d12f12f2ac41 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:12:33 -0700 Subject: [PATCH 007/178] Enable Sequence Packing and Pipeline Parallel in NeVA (#8957) * temp save Signed-off-by: yaoyu-33 * temp save 2 Signed-off-by: yaoyu-33 * update code Signed-off-by: yaoyu-33 * enable seq packing Signed-off-by: yaoyu-33 * fix neva and clip Signed-off-by: yaoyu-33 * Enable parallel seq packing algo and few other fixes Signed-off-by: yaoyu-33 * Pipeline parallel support Signed-off-by: yaoyu-33 * Update data preprocess Signed-off-by: yaoyu-33 * fix few pp issues Signed-off-by: yaoyu-33 * enable sequence packing w/ PP Signed-off-by: yaoyu-33 * Fix cu_seqlens in inputs Signed-off-by: yaoyu-33 * add assert Signed-off-by: yaoyu-33 * Depend on PP to decide whether do padding Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add docstring Signed-off-by: yaoyu-33 * Fix few evaluation issues Signed-off-by: yaoyu-33 * Fix few PP evaluation issues Signed-off-by: yaoyu-33 * Address comments Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * address comments Signed-off-by: yaoyu-33 * Fix license Signed-off-by: yaoyu-33 * Few neva bugs Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Few neva bugs Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../multimodal_llm/neva/conf/neva_config.yaml | 1 + .../multimodal_llm/neva/eval/vqa_science.py | 42 ++- .../multimodal_llm/neva/neva_evaluation.py | 38 +- .../sequence_packing/preprocess_dataset.py | 354 ++++++++++++++++++ .../multimodal/data/neva/neva_dataset.py | 74 +++- .../models/multimodal_llm/neva/neva_model.py | 169 +++++++-- nemo/collections/multimodal/parts/utils.py | 3 +- 
.../language_modeling/megatron_gpt_model.py | 6 +- .../modules/common/text_generation_utils.py | 4 + .../vision/data/megatron/data_samplers.py | 4 +- 10 files changed, 627 insertions(+), 68 deletions(-) create mode 100644 examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index b41f15c384a8..0caf4beb6a12 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -181,6 +181,7 @@ model: additional_special_tokens: null # ["", "", "", "", "", ""] data: + packed_sequence: False num_workers: 8 dataloader_type: cyclic data_path: diff --git a/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py b/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py index 8ea267ac8116..62d8788067bb 100644 --- a/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py +++ b/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py @@ -79,7 +79,8 @@ def eval_model(args): cfg.base_model_file = args.model_base cfg.inference.images_base_path = args.image_folder cfg.tensor_model_parallel_size = args.tp - cfg.trainer.devices = args.tp + cfg.pipeline_model_parallel_size = args.pp + cfg.trainer.devices = args.tp * args.pp model, image_processor = create_neva_model_and_processor(cfg) length_params: LengthParam = { @@ -102,7 +103,8 @@ def eval_model(args): questions = get_chunk(questions, args.num_chunks, args.chunk_idx) answers_file = os.path.expanduser(args.answers_file) os.makedirs(os.path.dirname(answers_file), exist_ok=True) - ans_file = open(answers_file, "w") + if is_global_rank_zero(): + ans_file = open(answers_file, "w") for i, line in enumerate(tqdm(questions, disable=(not is_global_rank_zero()))): idx = line["id"] question = line['conversations'][0] @@ -123,7 +125,8 @@ def eval_model(args): sampling_params=sampling_params, inference_config=cfg, ) - # import pdb; pdb.set_trace() + if responses is None: + continue outputs = responses[0]["clean_response"] # prompt for answer @@ -139,22 +142,24 @@ def eval_model(args): outputs = responses[0]["clean_response"] outputs = outputs_reasoning + '\n The answer is ' + outputs - ans_id = shortuuid.uuid() - ans_file.write( - json.dumps( - { - "question_id": idx, - "prompt": cur_prompt, - "text": outputs, - "answer_id": ans_id, - "model_id": args.model_path, - "metadata": {}, - } + if is_global_rank_zero(): + ans_id = shortuuid.uuid() + ans_file.write( + json.dumps( + { + "question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": args.model_path, + "metadata": {}, + } + ) + + "\n" ) - + "\n" - ) - ans_file.flush() - ans_file.close() + ans_file.flush() + if is_global_rank_zero(): + ans_file.close() if __name__ == "__main__": @@ -166,6 +171,7 @@ def eval_model(args): parser.add_argument("--answers-file", type=str, default="answer.jsonl") parser.add_argument("--conv-mode", type=str, default="llava_v0") parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--pp", type=int, default=1) parser.add_argument("--num-chunks", type=int, default=1) parser.add_argument("--chunk-idx", type=int, default=0) parser.add_argument("--temperature", type=float, default=0.2) diff --git a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py index bd3f975e4d54..d9d9a71db757 100644 --- 
a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py +++ b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py @@ -20,6 +20,7 @@ from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam from nemo.core.config import hydra_runner +from nemo.utils.get_rank import is_global_rank_zero try: @@ -121,22 +122,27 @@ def forward_loop(): ) # ============== Quantization End ========================= - results = [] - for response, prompt in zip(responses, final_prompts): - prompt['full_text'] = response["clean_text"] - prompt['text'] = response["clean_response"] - prompt['model_id'] = cfg.neva_model_file - if 'image_path' in prompt: - prompt['image'] = prompt.pop('image_path') - if 'answer_id' not in prompt: - prompt['answer_id'] = 0 - if 'metadata' not in prompt: - prompt['metadata'] = {} - results.append(prompt) - - with open(cfg.output_file, 'w') as f: - for result in results: - f.write(json.dumps(result) + '\n') + # PP middle stages do not yield any responses + if responses is None: + return + + if is_global_rank_zero(): + results = [] + for response, prompt in zip(responses, final_prompts): + prompt['full_text'] = response["clean_text"] + prompt['text'] = response["clean_response"] + prompt['model_id'] = cfg.neva_model_file + if 'image_path' in prompt: + prompt['image'] = prompt.pop('image_path') + if 'answer_id' not in prompt: + prompt['answer_id'] = 0 + if 'metadata' not in prompt: + prompt['metadata'] = {} + results.append(prompt) + + with open(cfg.output_file, 'w') as f: + for result in results: + f.write(json.dumps(result) + '\n') if __name__ == '__main__': diff --git a/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py b/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py new file mode 100644 index 000000000000..ee96ff6489d3 --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py @@ -0,0 +1,354 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example Usage: +-------------- +This script preprocesses a dataset for the NeMo Multimodal Learning framework. It requires specifying paths for data, images, and the tokenizer model, among other parameters. + +Command: +python examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py \ + --data_path=/path/to/LLaVA-Instruct-150K/llava_v1_5_mix665k_filtered.json \ + --image_folder=/path/to/LLaVA-Instruct-150K/images \ + --tokenizer_path=/path/to/checkpoints/tokenizer_add_special.model \ + --output_dir=/path/to/LLaVA-Instruct-150K/packed_seq_4096_336_v1 \ + --max_seq_length=12288 \ + --packing_algorithm=first_fit_shuffle \ + --hf_vision_encoder=openai/clip-vit-large-patch14-336 \ + --conv_template=v1 \ + --image_aspect_ratio=pad \ + --seed=42 + +Parameters: +----------- +--data_path: Path to the dataset file in JSON format. 
+--image_folder: Directory containing the images referenced in the dataset. +--tokenizer_path: Path to the tokenizer model. +--output_dir: Directory where the processed dataset will be stored. +--max_seq_length: The maximum sequence length of the model. +--packing_algorithm: Algorithm used for packing sequences. Defaults to 'first_fit_shuffle'. +--hf_vision_encoder: The Hugging Face vision encoder to use. Default is 'openai/clip-vit-large-patch14-336'. +--conv_template: Template for data conversion. Default is 'plain', with 'v1' as an alternative. +--image_aspect_ratio: The aspect ratio for processing images. Defaults to 'square', 'pad' for padding to maintain aspect ratio. +--seed: Seed for random operations in 'first_fit_shuffle'. +--hparams_file: Optional path to a YAML file containing additional hyperparameters. +""" + +import collections +import os +import random +import re +from argparse import ArgumentParser +from concurrent.futures import ThreadPoolExecutor, as_completed + +import numpy as np +import torch +from megatron.core.datasets.indexed_dataset import IndexedDataset, IndexedDatasetBuilder, get_bin_path, get_idx_path +from omegaconf import OmegaConf +from torch.utils.data import DataLoader +from tqdm import tqdm + +from nemo.collections.multimodal.data.neva.neva_dataset import make_supervised_data_module +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils import logging + +PACKING_ALGOS = ['first_fit_decreasing', 'first_fit_shuffle', 'shuffle_and_pack'] + + +def first_fit(seq_lens, max_seq_length): + """ + Assigns sequences to bins using the First Fit algorithm, by integrating the search + and assignment within the same function. It moves bins that can no longer fit the minimum sequence length + to a completed bins list, avoiding direct modification of the bins list during iteration. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + + Returns: + - List of bins with assigned sequence lengths. + """ + min_seq_len = min(seq_lens) # Find the minimum sequence length + completed_bins = [] # Initialize the completed bins list + bins = [] # Initialize the bins list to store active bins + + for s in tqdm(seq_lens): # Iterate through each sequence length + found_bin = False + for i, abin in enumerate(bins[:]): # Iterate over a shallow copy of bins + if sum(abin) + min_seq_len > max_seq_length: + completed_bins.append(abin) # Add to completed bins + bins[i] = 'TO_REMOVE' # Mark this bin for removal + continue + if sum(abin) + s <= max_seq_length: # Check if the bin can fit the sequence + bins[i].append(s) # If so, add the sequence to this bin + found_bin = True + break + + if not found_bin: # If no existing bin can fit the sequence + bins.append([s]) # Open a new bin for this sequence + + # Clean up bins marked 'TO_REMOVE' + bins = [bin for bin in bins if bin != 'TO_REMOVE'] + + # Combine completed bins with any remaining active bins + all_bins = completed_bins + bins + return all_bins + + +def chunkify(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i : i + n] + + +def parallel_first_fit(seq_lens, max_seq_length, chunk_size, num_workers): + """ + Assigns sequences to bins in parallel using the First Fit algorithm. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + - chunk_size: Size of chunks to divide seq_lens into for parallel processing. 
+ - num_workers: Number of worker threads to use in the ThreadPoolExecutor. + + Returns: + - List of bins with assigned sequence lengths. + """ + # Split the sequence lengths into chunks + chunks = list(chunkify(seq_lens, chunk_size)) + + # Function to process each chunk + def process_chunk(chunk): + return first_fit(chunk, max_seq_length) + + bins = [] # This will hold the final bins + with ThreadPoolExecutor(max_workers=num_workers) as executor: + # Submit each chunk to the executor + futures = [executor.submit(process_chunk, chunk) for chunk in chunks] + + # As each future completes, combine its bins with the final bins + for future in as_completed(futures): + bins.extend(future.result()) + + return bins + + +def first_fit_decreasing(seq_lens, max_seq_length): + """ + Assigns sequences to bins using the First Fit Decreasing algorithm. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + + Returns: + - List of bins with assigned sequence lengths. + """ + sorted_seq_lens = sorted(seq_lens, reverse=True) + return first_fit(sorted_seq_lens, max_seq_length) + + +def first_fit_shuffle(seq_lens, max_seq_length): + """ + Assigns sequences to bins using a shuffled version of the First Fit algorithm. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + + Returns: + - List of bins with assigned sequence lengths. + """ + shuffled_seq_lens = seq_lens[:] + np.random.shuffle(shuffled_seq_lens) + return parallel_first_fit(shuffled_seq_lens, max_seq_length, 20000, 32) + + +def shuffle_and_pack(seq_lens, max_seq_length): + """ + Assigns sequences to bins with shuffling, trying to maximize the packing efficiency. + After shuffling the sequences, they will be added to one bin in order. Once the bin cannot + take more sequences, we will move on to the next bin. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + + Returns: + - List of bins with assigned sequence lengths. + """ + shuffled_seq_lens = np.array(seq_lens) + np.random.shuffle(shuffled_seq_lens) + bins = [[]] + cur_bin_total = 0 + for s in tqdm(shuffled_seq_lens): + if cur_bin_total + s <= max_seq_length: + bins[-1].append(s) + cur_bin_total += s + else: + bins.append([s]) + cur_bin_total = s + return bins + + +def get_args(): + parser = ArgumentParser() + parser.add_argument("--data_path", type=str) + parser.add_argument("--image_folder", type=str) + parser.add_argument("--tokenizer_path", type=str) + parser.add_argument('--output_dir', required=True, type=str) + parser.add_argument("--max_seq_length", default=4096, type=int) + parser.add_argument('--packing_algorithm', default='first_fit_shuffle', choices=PACKING_ALGOS, type=str) + parser.add_argument("--hf_vision_encoder", default='openai/clip-vit-large-patch14-336', type=str) + parser.add_argument("--conv_template", default='plain', type=str) + parser.add_argument("--image_aspect_ratio", default='square', type=str) + parser.add_argument('--seed', default=0, type=int, help="Seed for shuffling, used with first_fit_shuffle.") + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join(os.path.dirname(__file__), '../conf/llava_config.yaml'), + required=False, + help="Path to the hparams file.", + ) + return parser.parse_args() + + +def pack_sequence(args, seq_lens): + """ + Packs sequences according to the specified algorithm in args. + + Parameters: + - args: Command line arguments. 
+ - seq_lens: List of sequence lengths. + + Returns: + - List of bins with assigned sequence lengths. + """ + np.random.seed(args.seed) + random.seed(args.seed) + + packing_fn = globals()[args.packing_algorithm] + bins = packing_fn(seq_lens, args.max_seq_length) + return bins + + +def main(): + torch.multiprocessing.set_sharing_strategy('file_system') + + args = get_args() + nemo_config = OmegaConf.load(args.hparams_file) + nemo_config.model.mm_cfg.vision_encoder.from_pretrained = args.hf_vision_encoder + nemo_config.model.data.data_path = args.data_path + nemo_config.model.data.image_folder = args.image_folder + nemo_config.model.data.conv_template = args.conv_template + nemo_config.model.data.image_aspect_ratio = args.image_aspect_ratio + + tokenizer = get_nmt_tokenizer(library="sentencepiece", tokenizer_model=args.tokenizer_path,) + train_ds = make_supervised_data_module(tokenizer=tokenizer, model_cfg=nemo_config.model)["train_dataset"] + train_dl = DataLoader(train_ds, num_workers=32, collate_fn=None, shuffle=False) + # Example shape: {'tokens': torch.Size([1, 344]), 'labels': torch.Size([1, 344]), 'image': torch.Size([1, 1, 3, 224, 224])} + + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + logging.info(f"Output directory: {output_dir}") + + prefix_path = f"{output_dir}/packed_seq_dataset" + # Original Datasets to Sequence Lengths Files + builders = {} + for item_dict in tqdm(train_dl, desc="Building indexed datasets"): + item_dict = {k: v[0] for k, v in item_dict.items()} + seq_len = len(item_dict['tokens']) + if seq_len in builders: + builder = builders[seq_len] + else: + builder_path = get_bin_path(f"{prefix_path}/seqlen_{seq_len}") + logging.info(f"Creating builder for sequence length {seq_len} at {builder_path}") + builder = IndexedDatasetBuilder(builder_path, dtype=np.float32, multimodal=True) + builders[seq_len] = builder + builder.add_item(item_dict['tokens']) + builder.add_item(item_dict['labels']) + builder.add_item(item_dict['image'], 1) + builder.end_document() + del item_dict + + for seq_len, builder in builders.items(): + idx_path = get_idx_path(f"{prefix_path}/seqlen_{seq_len}") + logging.info(f"Finalizing builder for sequence length {seq_len} at {idx_path}") + builder.finalize(idx_path) + + # Packing Sequences into Bins + files = os.listdir(f"{output_dir}/packed_seq_dataset") + pattern = rf"seqlen_(\d+).bin" + seq_len_list = [] + for file in files: + match = re.match(pattern, file) + if match: + seq_len = int(match.group(1)) + seq_len_list.append(seq_len) + + aggregated_seq_lens = [] + doc_pop_order = {} + indexed_datasets = {} + for seq_len in seq_len_list: + dataset_path = f"{prefix_path}/seqlen_{seq_len}" + dataset = IndexedDataset(dataset_path, multimodal=True) + aggregated_seq_lens.extend([seq_len] * (len(dataset.document_indices) - 1)) + doc_pop_order[seq_len] = list(np.random.permutation(len(dataset.document_indices) - 1)) + indexed_datasets[seq_len] = dataset + + logging.info("Getting bins") + bins = pack_sequence(args, aggregated_seq_lens) + logging.info("Finished getting bins") + + num_bins = len(bins) + avg_bins_len = sum([len(x) for x in bins]) / num_bins + avg_bins_sum = sum([sum(x) for x in bins]) / num_bins + logging.info(f"Number of bins: {num_bins}, Average bin length: {avg_bins_len}, Average bin sum: {avg_bins_sum}") + + # Reading Sequence Lengths and Packing into New Files + final_builder_path = get_bin_path(f"{prefix_path}") + logging.info(f"Creating final builder at {final_builder_path}") + final_builder = 
IndexedDatasetBuilder(final_builder_path, dtype=np.float32, multimodal=True) + + for assignment in tqdm(bins, desc="Building final dataset"): + packed_items = collections.defaultdict(list) + packed_items["seq_indices"] = [0] + for seq_len in assignment: + doc_index = doc_pop_order[seq_len].pop() + doc_start = indexed_datasets[seq_len].document_indices[doc_index] + doc_end = indexed_datasets[seq_len].document_indices[doc_index + 1] + item_dict = { + "tokens": torch.tensor((indexed_datasets[seq_len][doc_start:doc_end][0])[0]), + "labels": torch.tensor((indexed_datasets[seq_len][doc_start:doc_end][0])[1]), + "image": torch.tensor((indexed_datasets[seq_len][doc_start:doc_end][0])[2]), + } + for key in ["tokens", "labels", "image"]: + packed_items[key].append(item_dict[key]) + packed_items["seq_indices"].append(packed_items["seq_indices"][-1] + seq_len) + + for key in ["seq_indices", "tokens", "labels", "image"]: + final_builder.add_item( + torch.tensor(packed_items[key]) if key == "seq_indices" else torch.cat(packed_items[key], dim=0), + 1 if key == "image" else 0, + ) + final_builder.end_document() + + idx_path = get_idx_path(f"{prefix_path}") + logging.info(f"Finalizing final builder at {idx_path}") + final_builder.finalize(idx_path) + logging.info(f"Number of bins: {num_bins}, Average bin length: {avg_bins_len}, Average bin sum: {avg_bins_sum}") + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 71d9bda12de1..ddd409e928b2 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -18,7 +18,7 @@ import re import tarfile from dataclasses import dataclass -from typing import Any, Dict, List, Sequence, Union +from typing import Any, Dict, List, Sequence, Tuple, Union import torch import torch.nn.functional as F @@ -49,6 +49,15 @@ MAX_NUM_IMAGES = 1 IGNORE_INDEX = -1 +try: + from megatron.core.datasets.indexed_dataset import IndexedDataset + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + class TarOrFolderImageLoader: """ @@ -781,12 +790,27 @@ class DataCollatorForSupervisedDataset(object): tokenizer: transformers.PreTrainedTokenizer def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + packed_sequence = "cu_seqlens" in instances[0] max_len = max(instance['tokens'].shape[0] for instance in instances) max_len = (max_len - 1) // 64 * 64 + 64 for instance in instances: pad_len = max_len - instance['tokens'].shape[0] instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', -1) + if packed_sequence and instance["cu_seqlens"][-1] != max_len: + instance["cu_seqlens"] = torch.cat((instance["cu_seqlens"], torch.IntTensor([max_len])), 0) + + if packed_sequence: + max_len_cu = max(instance['cu_seqlens'].shape[0] for instance in instances) + max_len_image = max(instance['image'].shape[0] for instance in instances) + for instance in instances: + pad_len_cu = max_len_cu - instance['cu_seqlens'].shape[0] + instance['cu_seqlens'] = F.pad(instance['cu_seqlens'], (0, pad_len_cu), 'constant', max_len) + + x = instance['image'] + num_pad = max_len_image - x.shape[0] + pad_tensor = torch.zeros(num_pad, *x.shape[1:], dtype=x.dtype, device=x.device) + instance['image'] = torch.cat((x, pad_tensor), dim=0) batch = default_collate(instances) tokenizer = 
self.tokenizer @@ -796,13 +820,25 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: labels = batch['labels'] media = batch.get('image') - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - data=tokens, - eod_token=tokenizer.eos_id, - eod_mask_loss=model_cfg.data.get("eod_mask_loss", False), - reset_attention_mask=False, - reset_position_ids=False, - ) + if packed_sequence: + cu_seqlens = batch["cu_seqlens"] + position_ids = [] + for cu_seqlen in cu_seqlens: + position_ids.append([]) + for ind in range(0, len(cu_seqlen) - 1): + seqlen = cu_seqlen[ind + 1] - cu_seqlen[ind] + position_ids[-1].extend(list(range(seqlen))) + position_ids = torch.LongTensor(position_ids) + loss_mask = torch.ones(tokens.size(), dtype=torch.float, device=tokens.device) + attention_mask = torch.ones(tokens.size(), dtype=torch.long, device=tokens.device) + else: + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=tokenizer.eos_id, + eod_mask_loss=model_cfg.data.get("eod_mask_loss", False), + reset_attention_mask=False, + reset_position_ids=False, + ) loss_mask[labels == -1] = 0.0 tokens[tokens == -1] = 0 @@ -821,6 +857,8 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 'position_ids': position_ids, 'media': media, } + if packed_sequence: + batch["cu_seqlens"] = cu_seqlens return batch @@ -859,3 +897,23 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: ) return dict(train_dataset=train_dataset, eval_dataset=train_dataset) + + +class NevaPackedSeqDatatset(Dataset): + def __init__(self, data_path: str, crop_size: Tuple[int, int] = (224, 224)): + self.ds = IndexedDataset(data_path) + self.crop_size = crop_size + + def __len__(self): + return len(self.ds.document_indices) - 1 + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + doc_start = self.ds.document_indices[i] + batch = { + "cu_seqlens": torch.IntTensor(self.ds[doc_start]), + "tokens": torch.LongTensor(self.ds[doc_start + 1]), + "labels": torch.LongTensor(self.ds[doc_start + 2]), + "image": torch.FloatTensor(self.ds[doc_start + 3]).reshape(-1, 3, *self.crop_size), + } + + return batch diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index cff8ab1a7b5f..5b50a8340b06 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -21,6 +21,7 @@ import torch.nn.functional as F from einops import rearrange, repeat from omegaconf.dictconfig import DictConfig +from pkg_resources import packaging from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPVisionModel @@ -28,6 +29,7 @@ from nemo.collections.multimodal.data.neva.conversation import DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN from nemo.collections.multimodal.data.neva.neva_dataset import ( DataCollatorForSupervisedDataset, + NevaPackedSeqDatatset, make_supervised_data_module, ) from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import ( @@ -43,7 +45,10 @@ AdapterName, MultimodalProjectorAdapterConfig, ) -from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group +from nemo.collections.nlp.modules.common.megatron.utils import ( + average_losses_across_data_parallel_group, + get_iterator_k_split, +) from 
nemo.collections.nlp.modules.common.text_generation_utils import ( generate, get_computeprob_response, @@ -61,6 +66,7 @@ try: import apex.transformer.pipeline_parallel.utils + from apex.transformer.pipeline_parallel.utils import get_num_microbatches HAVE_APEX = True @@ -71,6 +77,7 @@ try: from megatron.core import InferenceParams, dist_checkpointing, parallel_state from megatron.core.models.gpt import GPTModel as MCoreGPTModel + from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -385,14 +392,24 @@ def __init__( NevaBaseModel.__init__(self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs) def freeze_llm(self, mm_cfg): - for param in chain(self.embedding.parameters(), self.decoder.parameters(), self.output_layer.parameters(),): + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + embedding_parameters = self.embedding.parameters() + else: + embedding_parameters = {} + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + output_layer_parameters = self.output_layer.parameters() + else: + output_layer_parameters = {} + + for param in chain(embedding_parameters, self.decoder.parameters(), output_layer_parameters,): param.requires_grad = False def forward( self, *args, **kwargs, ): media = kwargs.pop('media', None) - self.embedding.word_embeddings.set_media(media) + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + self.embedding.word_embeddings.set_media(media) return MCoreGPTModel.forward(self, *args, **kwargs) @@ -418,7 +435,8 @@ def forward( self, *args, **kwargs, ): media = kwargs.pop('media', None) - self.embedding.word_embeddings.set_media(media) + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + self.embedding.word_embeddings.set_media(media) return GPTModel.forward(self, *args, **kwargs) @@ -611,7 +629,73 @@ def forward(self, tokens, text_position_ids, attention_mask, labels, media=None) return output_tensor def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): - return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step) + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step) + else: + batch, _, _ = next(dataloader_iter) + _, seq_length = batch['tokens'].shape + batch_iter = get_iterator_k_split(batch, get_num_microbatches()) + + # handle asynchronous grad reduction + no_sync_func = None + grad_sync_func = None + param_sync_func = None + if not forward_only and self.with_distributed_adam: + no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + grad_sync_func = self.reduce_overlap_gradients + param_sync_func = self.sync_overlap_parameters + + # pipeline schedules will get these from self.model.config + for module in self.get_model_module_list(): + module.config.no_sync_func = no_sync_func + module.config.grad_sync_func = grad_sync_func + module.config.param_sync_func = param_sync_func + + # run forward and backwards passes for an entire global batch + # we do this inside training_step to support pipeline parallelism + fwd_bwd_function = get_forward_backward_func() + # print(f"{torch.distributed.get_rank()}: {parallel_state.is_pipeline_last_stage()} {fwd_bwd_function}") + + # TODO @akhattar: add num_micro_batches_with_partial_activation_checkpoints when ready + losses_reduced_per_micro_batch = fwd_bwd_function( + 
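# Illustration (not part of this patch): get_iterator_k_split above splits the per-rank
# global batch into get_num_microbatches() equal micro-batches; e.g. global_batch_size=16,
# data_parallel_size=2, micro_batch_size=1 gives 16/(2*1)=8 micro-batches per rank, which
# the pipeline schedule below consumes one at a time.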
forward_step_func=self.get_forward_output_and_loss_func(forward_only), + data_iterator=self._make_data_iterator_list(batch_iter), + model=self.model, + num_microbatches=get_num_microbatches(), + forward_only=forward_only, + seq_length=seq_length, + micro_batch_size=self.cfg.micro_batch_size, + first_val_step=first_val_step, + ) + + # only the last stages of the pipeline return losses + if losses_reduced_per_micro_batch: + if (not forward_only) or self.cfg.data.get('validation_drop_last', True): + # average loss across micro batches + loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] + loss_tensor = torch.concat(loss_tensors_list) + loss_mean = loss_tensor.mean() + else: + # Get the total loss since micro batches sizes are not uniform + loss_sum_tensors_list = [ + loss_sum['loss_sum_and_ub_size'] + for loss_sum in losses_reduced_per_micro_batch + if loss_sum['loss_sum_and_ub_size'][1] > 0 + ] + loss_sum = ( + torch.vstack(loss_sum_tensors_list).sum(axis=0) + if len(loss_sum_tensors_list) > 0 + else torch.tensor([0.0, 0.0]).cuda() + ) + return loss_sum + else: + # we're not on the last pipeline stage so no losses + if forward_only: + loss_mean = [] + else: + loss_mean = torch.tensor(0.0).cuda() + + return loss_mean def training_step(self, dataloader_iter): """ @@ -631,7 +715,9 @@ def loss_func(output_tensor, loss_mask): return loss_for_ub, dict(avg=reduced_loss[0].unsqueeze(0)) def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): - batch, _, _ = next(dataloader_iter) + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] if parallel_state.get_pipeline_model_parallel_world_size() == 1: for k in batch.keys(): if self.get_attention_mask_from_fusion: @@ -644,28 +730,36 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ for k in batch.keys(): if self.get_attention_mask_from_fusion: batch[k] = ( - batch[k].cuda(non_blocking=True) if k in ['tokens', 'position_ids', 'media'] else None + batch[k].cuda(non_blocking=True) + if k in ['tokens', 'position_ids', 'media', 'cu_seqlens'] + else None ) else: batch[k] = ( batch[k].cuda(non_blocking=True) - if k in ['tokens', 'position_ids', 'attention_mask', 'media'] + if k in ['tokens', 'position_ids', 'attention_mask', 'media', 'cu_seqlens'] else None ) elif parallel_state.is_pipeline_last_stage(): # Last pipeline stage needs the labels, loss_mask, and attention_mask for k in batch.keys(): if self.get_attention_mask_from_fusion: - batch[k] = batch[k].cuda(non_blocking=True) if k in ['labels', 'loss_mask'] else None + batch[k] = ( + batch[k].cuda(non_blocking=True) + if k in ['labels', 'loss_mask', 'cu_seqlens'] + else None + ) else: batch[k] = ( batch[k].cuda(non_blocking=True) - if k in ['labels', 'loss_mask', 'attention_mask'] + if k in ['labels', 'loss_mask', 'attention_mask', 'cu_seqlens'] else None ) else: # Intermediate pipeline stage doesn't need any inputs - batch = {k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels', 'media']} + batch = { + k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels', 'media', 'loss_mask'] + } forward_args = { 'input_ids': batch['tokens'], @@ -678,16 +772,40 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ if self.use_loss_mask: forward_args['loss_mask'] = batch['loss_mask'] forward_args['checkpoint_activations_all_layers'] = checkpoint_activations_all_layers + else: + if 'cu_seqlens' in batch: # packed sequence 
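# Illustration (not part of this patch): cu_seqlens holds the cumulative sequence boundaries
# of the packed sample, matching the "seq_indices" written by the packing script above; e.g.
# packing sequences of lengths 3, 5 and 4 into one row gives cu_seqlens = [0, 3, 8, 12], and
# TE's THD attention uses these offsets so attention never crosses a sequence boundary.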
+ # these args are passed eventually into TEDotProductAttention.forward() + cu_seqlens = batch['cu_seqlens'].squeeze() # remove batch size dimension (mbs=1) + max_seqlen = batch['max_seqlen'].squeeze() if 'max_seqlen' in batch else None + + try: + from megatron.core.packed_seq_params import PackedSeqParams + except (ImportError, ModuleNotFoundError) as e: + mcore_version = packaging.version.Version(version('megatron-core')) + logging.error( + f"megatron-core v{mcore_version} does not support training with packed sequence. " + "Please use megatron-core >= 0.5.0, or set model.data.train_ds.packed_sequence=False" + ) + raise e + forward_args['packed_seq_params'] = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + qkv_format='thd', + ) output_tensor = model(**forward_args) - return output_tensor, partial(loss_func, loss_mask=batch['loss_mask']) + return output_tensor, partial(loss_func, loss_mask=batch.get('loss_mask')) return fwd_output_and_loss_func def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): - batch, _, _ = next(dataloader_iter) + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] extra_arg = {} ( tokens, @@ -859,9 +977,14 @@ def setup(self, stage=None): def build_train_valid_test_datasets(self): logging.info('Building Neva datasets.') - ds_dict = make_supervised_data_module(tokenizer=self.tokenizer, model_cfg=self.cfg,) - self._train_ds = ds_dict["train_dataset"] - self._validation_ds = ds_dict["eval_dataset"] + if self.cfg.data.get("packed_sequence", False): + assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence" + self._train_ds = NevaPackedSeqDatatset(self.cfg.data.data_prefix, self.cfg.data.get("crop_size")) + self._validation_ds = NevaPackedSeqDatatset(self.cfg.data.data_prefix, self.cfg.data.get("crop_size")) + else: + ds_dict = make_supervised_data_module(tokenizer=self.tokenizer, model_cfg=self.cfg,) + self._train_ds = ds_dict["train_dataset"] + self._validation_ds = ds_dict["eval_dataset"] return self._train_ds, self._validation_ds @@ -872,12 +995,17 @@ def build_pretraining_data_loader( logging.info(f'Building dataloader with consumed samples: {consumed_samples}') # Megatron sampler + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + micro_batch_size = self.cfg.micro_batch_size + else: + micro_batch_size = self.cfg.global_batch_size // parallel_state.get_data_parallel_world_size() + if hasattr(self.cfg.data, 'dataloader_type') and self.cfg.data.dataloader_type is not None: if self.cfg.data.dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataset), consumed_samples=consumed_samples, - micro_batch_size=self.cfg.micro_batch_size, + micro_batch_size=micro_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=drop_last, @@ -889,7 +1017,7 @@ def build_pretraining_data_loader( dataset=dataset, total_samples=len(dataset), consumed_samples=consumed_samples, - micro_batch_size=self.cfg.micro_batch_size, + micro_batch_size=micro_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=self.cfg.get('drop_last', True), @@ -953,14 +1081,9 @@ def load_state_dict(self, state_dict, strict=False): def on_load_checkpoint(self, checkpoint) -> None: pass - # if 
self.mcore_gpt: - # state_dict = checkpoint["state_dict"] - # self.load_state_dict(state_dict) def sharded_state_dict(self, prefix: str = ''): return None - # sharded_state_dict = MegatronGPTModel.sharded_state_dict(self, prefix) - # return sharded_state_dict def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any: inference_config = self.get_inference_config() diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index 723e965eb8a8..71c28cf00855 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -320,7 +320,7 @@ def dummy(): def create_neva_model_and_processor(cfg): - from nemo.collections.multimodal.models.neva.neva_model import MegatronNevaModel + from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel plugins = [] if cfg.get('cluster_type', None) == 'BCP': @@ -366,6 +366,7 @@ def create_neva_model_and_processor(cfg): neva_cfg.precision = trainer.precision neva_cfg.mm_cfg.llm.from_pretrained = cfg.get('base_model_file', None) neva_cfg.apply_rope_fusion = False + neva_cfg.fp8 = False # neva_cfg.mm_cfg.vision_encoder.from_pretrained = None model = MegatronNevaModel.restore_from( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index c2e1f0ed48b7..7a2f3459470c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -784,7 +784,11 @@ def training_step(self, dataloader_iter): self._optimizer._finish_bucket_grad_sync() elif self.megatron_amp_O2: # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously) - if self.cfg.get('pipeline_model_parallel_size', 1) > 1 or self.cfg.get('sequence_parallel', False): + if ( + self.cfg.get('pipeline_model_parallel_size', 1) > 1 + or self.cfg.get('sequence_parallel', False) + or not self.cfg.get('async_grad_allreduce', True) + ): # main grads are stored in the MainParamsOptimizer wrapper self._optimizer.allreduce_main_grads() else: diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index d130322404b6..b50c9de682f7 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -173,6 +173,10 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para **strategy_args, ) + # Middle stages of PP will return None + if response is None: + continue + # Regular expression pattern to match the sequence pattern = re.compile(rf'{DEFAULT_IM_START_TOKEN}( ⁇ )+{DEFAULT_IM_END_TOKEN}') pattern_nvgpt = re.compile(rf'{DEFAULT_IM_START_TOKEN}({DEFAULT_IMAGE_PATCH_TOKEN})+{DEFAULT_IM_END_TOKEN}') diff --git a/nemo/collections/vision/data/megatron/data_samplers.py b/nemo/collections/vision/data/megatron/data_samplers.py index 82fc49990c49..2f63e675731b 100644 --- a/nemo/collections/vision/data/megatron/data_samplers.py +++ b/nemo/collections/vision/data/megatron/data_samplers.py @@ -67,7 +67,9 @@ def __iter__(self): random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: - full_bucket_size = (self.total_samples // self.micro_batch_size) * self.micro_batch_size + full_bucket_size = ( + 
self.total_samples // self.micro_batch_times_data_parallel_size + ) * self.micro_batch_times_data_parallel_size full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) From 43ccc1d6bd82ec788d970f90c3ed7192882651b3 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 30 Apr 2024 13:55:24 -0700 Subject: [PATCH 008/178] [Nemo CICD] Trigger on comment issued (#9062) * match pytorch * match pytorch --- .github/workflows/cicd-main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a13284521b3c..de250596da62 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -44,7 +44,7 @@ jobs: # checkout-repository: # runs-on: self-hosted-azure # container: -# image: nvcr.io/nvidia/pytorch:24.01-py3 +# image: nvcr.io/nvidia/pytorch:24.02-py3 # volumes: # - ${{ github.workspace }}:/workspace # steps: @@ -60,7 +60,7 @@ jobs: if: ${{ github.event.label.name == 'Run CICD' }} # uses: actions/cache@v2 #container: -# image: nvcr.io/nvidia/pytorch:24.01-py3 +# image: nvcr.io/nvidia/pytorch:24.02-py3 # options: # # --user 0:128 # --device=/dev/nvidia0 @@ -78,7 +78,7 @@ jobs: run: | # Pull base PyTorch container docker pull nvcr.io/nvidia/pytorch:24.02-py3 - docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.01-py3 /bin/bash -c ' + docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.02-py3 /bin/bash -c ' set -x # PyTorch version From 28506795514060c4d7741e1e401717e1c3b10beb Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Tue, 30 Apr 2024 22:25:33 -0700 Subject: [PATCH 009/178] Mingyuanm/add back fp8 support to sd (#9070) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update branch Signed-off-by: eharper * Add dist ckpt support for regular optimizers (#7749) * Add dist ckpt support for regular optimizers Signed-off-by: Mikołaj Błaż * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fix imports Signed-off-by: dimapihtar * imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ci imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert asr notebook Signed-off-by: dimapihtar * revert asr notebook Signed-off-by: dimapihtar --------- Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Pin lhotse=1.19.2 in r1.23.0 (#8303) Signed-off-by: Piotr Żelasko * Cache Aware Streaming tutorial notebook (#8296) * add notebook Signed-off-by: Elena Rastorgueva * rename old notebook to Buffered_Streaming Signed-off-by: Elena Rastorgueva * call setup_streaming_params in set_default_att_context_size method Signed-off-by: Elena Rastorgueva * update links in docs Signed-off-by: Elena Rastorgueva * update links to tutorials in docs Signed-off-by: Elena Rastorgueva * remove hard-coding Signed-off-by: Elena Rastorgueva * rename var Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * fix path location and branch (#8304) * fix path location and branch Signed-off-by: Nithin Rao Koluguri * change to a floating point number Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri Co-authored-by: Somshubra Majumdar * add deallocate pipeline output optimization (#8279) * add deallocate pipeline output optimization Signed-off-by: Jimmy Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fix memory leak caused by context parallelism hanging references by omegaconf (#8299) * save cp_size to self Signed-off-by: Jimmy Zhang * use parallel_state instead of self Signed-off-by: Jimmy Zhang --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: Eric Harper * remove assertion (#8302) Signed-off-by: dimapihtar * Update PEFT Doc (#8262) * update peft doc Signed-off-by: Chen Cui * remove old prompt learning doc and notebook Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * Merge branch 'r1.23.0' into chcui/update_peft_doc Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui * Attention encoder-decoder models for multiple speech-to-text tasks (#8242) (#8324) * Rebasing canary changes at current main Signed-off-by: Piotr Żelasko * Move the changes from asr transformer to nlp transformer as originally intended Signed-off-by: Piotr Żelasko * update eval to strip spaces before punctuations Signed-off-by: stevehuang52 * update pc strip Signed-off-by: stevehuang52 * [canary] Refactor: `PromptedAudioToTextLhotseDataset` and `EncDecMultiTaskModel` (#8247) * Create a separate 
CanaryDataset and use it inside `transformer_bpe_models.py`. Ditches `token_sequence_format`. Signed-off-by: Piotr Żelasko * [canary] Refactor: move changes in transformer_bpe_models.py to Canar… (#8252) * [canary] Refactor: move changes in transformer_bpe_models.py to CanaryModel Signed-off-by: Piotr Żelasko * Rename `CanaryModel` to `EncDecMultiTaskModel` and remove inheritance from `EncDecTransfModelBPE`; add a separate config for this model Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Rename `CanaryDataset` to `PromptedAudioToTextLhotseDataset`; add `prompt_format_fn` argument; clean-up the `_canary_prompt_format` function a bit Signed-off-by: Piotr Żelasko * Move tokenization into `prompt_format_fn`, fix usage, add docs Signed-off-by: Piotr Żelasko * Backward-compatible utterance validation Signed-off-by: Piotr Żelasko * Improve type annotations Signed-off-by: Piotr Żelasko * config and prompt_fn registration changes from review Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * fix transcribe config Signed-off-by: stevehuang52 * Refactor Canary to follow schema of remaining ASR models (#8260) * Initial draft of multi task beam decoding strategy Signed-off-by: smajumdar * Stabilize inference Signed-off-by: smajumdar * Update AED Multi Task model to mostly conform to Archetype-Type format. Update config Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add change decoding strategy Signed-off-by: smajumdar * Remove redundant imports Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Cleanup Signed-off-by: smajumdar * Cleanup Signed-off-by: smajumdar * remove asr transformer dependency on nlp Signed-off-by: stevehuang52 * clean up Signed-off-by: stevehuang52 * copy token_classifier from nlp to asr Signed-off-by: stevehuang52 * Address comments Signed-off-by: smajumdar * Add typing to beam decoding Signed-off-by: smajumdar * Make prompt format configurable Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * drop asr dependency on nlp Signed-off-by: stevehuang52 --------- Signed-off-by: smajumdar Signed-off-by: stevehuang52 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: stevehuang52 * fix transcribe, update asr evaluator Signed-off-by: stevehuang52 * Extend the docs for the canary prompt_fn Signed-off-by: Piotr Żelasko * Incorporate changes from Nithin's code review Signed-off-by: Piotr Żelasko * training bug fix and adding launch script for speech_multitask (#8270) * bug fix and adding launch script for speech_multitask Signed-off-by: Krishna Puvvada * update launch script example in speech_to_text_aed.py Signed-off-by: Krishna Puvvada --------- Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * Fix: drop_last must be true in validation/test otherwise the training will hang Signed-off-by: Piotr Żelasko * revert to current transcribe API Signed-off-by: stevehuang52 * revert changes to NLP, update docs Signed-off-by: stevehuang52 * update eval utils Signed-off-by: stevehuang52 * update docs Signed-off-by: stevehuang52 * Remove DALI; rename compute_audio_loss to compute_loss Signed-off-by: Piotr Żelasko * set default use_model_transcribe=False Signed-off-by: stevehuang52 * change os.path.dirname to pathlib Signed-off-by: stevehuang52 * [canary] 
Test for CanaryTokenizer + refactoring (#8285) * Test for CanaryTokenizer Signed-off-by: Piotr Żelasko * Attempt at refactor... Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Update config for AED models (#8294) Signed-off-by: smajumdar * set default calculate_wer=False in transcribe_speech.py Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 1 Co-authored-by: Nithin Rao Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 2 Signed-off-by: Piotr Żelasko * Document compute_loss Signed-off-by: Piotr Żelasko * update transcribe_speech.py Signed-off-by: stevehuang52 * add docstring Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: stevehuang52 Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Piotr Żelasko Co-authored-by: stevehuang52 Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Nithin Rao (cherry picked from commit d10726da72f74eb5a95056843d1f9e2562a5051c) Co-authored-by: Piotr Żelasko * Multimodal r1.23.0 bug fix (#8315) * Rename quick-gelu Signed-off-by: yaoyu-33 * ddpm config guard Signed-off-by: yaoyu-33 * Fix ddpm edit api Signed-off-by: yaoyu-33 * Fix insert_image_token cfg issue Signed-off-by: yaoyu-33 * neva updates Signed-off-by: yaoyu-33 * reformat Signed-off-by: yaoyu-33 * Add back jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bugs Signed-off-by: yaoyu-33 * Update default neva template Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fixes for MoE parameter passing & use of AutoTokenizer/Model for mistral. 
(#8272) Signed-off-by: Alexandros Koumparoulis * Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 (#8334) Signed-off-by: Sangkug Lym Co-authored-by: Eric Harper * Remove asr webapp (#8347) Signed-off-by: smajumdar * remove _target_ at model level in aed config (#8351) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * Add change_vocabulary and save_tokenizers() support to Multitask ASR models (#8357) * Add change_vocabulary and save_tokenizers() support Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/asr/models/aed_multitask_models.py Co-authored-by: Piotr Żelasko Signed-off-by: Somshubra Majumdar --------- Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko * Change default (#8371) Signed-off-by: smajumdar * bug fix in fast-conformer-aed.yaml and adding jenkins test for speech_to_text_aed model (#8368) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada Co-authored-by: Somshubra Majumdar * Enable megatron core loggers for GPT pretraining (#8354) * Logging changes tested for gpt_pretraining Signed-off-by: Aishwarya Bhandare * Additional args Signed-off-by: Aishwarya Bhandare * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Aishwarya Bhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * mcore ds fix (#8283) * [tutorial] fixed missing RIR scripts file. (#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex & TE commits Signed-off-by: dimapihtar * revert apex installation Signed-off-by: dimapihtar * turn off the fusion for jenkins Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * Add Finetuning tutorial with 
HF Datasets (#8356) * Add Finetuning tutorial with HF Datasets Signed-off-by: Nithin Rao Koluguri * update on Som comments Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * release updates (#8378) * [tutorial] fixed missing RIR scripts file. (#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for dict data input type Signed-off-by: dimapihtar * add mock ds test Signed-off-by: dimapihtar * add test for dict data input type Signed-off-by: dimapihtar * mcore ds fix Signed-off-by: dimapihtar * data input fix Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * MCore dataset compatibility for tokenizers (#8390) * Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer Signed-off-by: Valerie Sarge * Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer. Signed-off-by: Valerie Sarge --------- Signed-off-by: Valerie Sarge Co-authored-by: Pablo Garay * Mcore customization doc (#8298) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * initial placeholder Signed-off-by: Huiying Li * add to intro/index.rst Signed-off-by: Huiying Li * initial content update Signed-off-by: Huiying Li * add diff images Signed-off-by: Huiying Li size Signed-off-by: Huiying Li * minor fixes * minor language change Signed-off-by: Chen Cui * clean changes --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Chen Cui Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: Huiying Li Co-authored-by: Chen Cui * wer fix (#8404) Signed-off-by: Travis Bartley * updated link to pubmed (#8402) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * Update NFA video download link (#8406) * update nfa nasa video link Signed-off-by: Elena Rastorgueva * update link in markdown Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * revert changes (#8410) Signed-off-by: Chen Cui * Fix dreambooth data sampler issue (#8400) * Turn on drop last Signed-off-by: yaoyu-33 * Some neva fixes Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fixed errors in the CTM gen functions (#8416) Signed-off-by: Taejin Park * add ensemble decoding fix (#8427) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * SDE bugfix log (#8430) Signed-off-by: George * mcore customization doc minor fix (#8421) Signed-off-by: Huiying Li * NeMo-Mistral to HF converter bugfix. 
(#8353) Signed-off-by: Alexandros Koumparoulis * Fixing mcore bert for TP, PP and SP (#8336) * Fixing mcore bert for TP, PP and SP * Fixing mcore bert for TP, PP and SP * Fixing mcore version * Fixing mcore version * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Eric Harper * Add settings to suppress bf16 compile errors in CI on V100 (#8481) * Add settings to suppress bf16 compile errors in CI on V100 Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Abhishree Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * MoE parameter passing (#8255) * MoE parameter passing Signed-off-by: Alexandros Koumparoulis * Pass EP/MoE params in consumer scripts. Signed-off-by: Alexandros Koumparoulis * PR fixes Signed-off-by: Alexandros Koumparoulis * Use latest commit of mcore-0.5 Signed-off-by: Alexandros Koumparoulis * CI fix Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update k2 version (#8478) (#8492) Signed-off-by: Vladimir Bataev * Add fp8 support for SD/Update notebook paths (#8489) * Add fp8 support for SD/Update notebook paths Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * pin to 0.5.0 (#8465) Signed-off-by: eharper * Update NeMo Multimodal Requirements (#8515) * Update requirements_multimodal.txt Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update github raw content link (#8517) Signed-off-by: Chen Cui * Add dep notice for notebooks (#8522) * add dep notice Signed-off-by: eharper * revert Signed-off-by: eharper --------- Signed-off-by: eharper * Revert FP8 integration (#8520) * Revert FP8 integration Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update data prep notebook (#8532) Signed-off-by: Mingyuan Ma * Add back fp8 support * SD-FP8: fix the bug of normalization location Signed-off-by: Mingyuan Ma * map potential FP8 ckpt to FP16 Signed-off-by: Mingyuan Ma * Add TE fp8 training Signed-off-by: Mingyuan Ma * Only overwrite unet precision when self.megatron_amp_O2 is true Signed-off-by: Mingyuan Ma * New structure is now compatible with old ckpts Signed-off-by: Mingyuan Ma * Add support on mapping old unet checkpoint to new 
structure and FP8 structure Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Sync with main branch Signed-off-by: Mingyuan Ma --------- Signed-off-by: eharper Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Signed-off-by: Piotr Żelasko Signed-off-by: Elena Rastorgueva Signed-off-by: Nithin Rao Koluguri Signed-off-by: Jimmy Zhang Signed-off-by: Chen Cui Signed-off-by: yaoyu-33 Signed-off-by: Alexandros Koumparoulis Signed-off-by: Sangkug Lym Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Somshubra Majumdar Signed-off-by: Aishwarya Bhandare Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Valerie Sarge Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Travis Bartley Signed-off-by: Taejin Park Signed-off-by: George Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Abhishree Signed-off-by: Vladimir Bataev Signed-off-by: Mingyuan Ma Co-authored-by: eharper Co-authored-by: mikolajblaz Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: Nithin Rao Co-authored-by: Somshubra Majumdar Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: Chen Cui Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: Pablo Garay Co-authored-by: Valerie Sarge Co-authored-by: Huiying Co-authored-by: Bobby Chen Co-authored-by: Huiying Li Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis Co-authored-by: Vladimir Bataev Co-authored-by: Mengdi Wang --- .../stable_diffusion/conf/sd_train.yaml | 37 ++--- .../stable_diffusion/sd_infer.py | 3 + .../stable_diffusion/ldm/ddpm.py | 2 +- .../modules/stable_diffusion/attention.py | 66 ++++++-- .../diffusionmodules/openaimodel.py | 143 ++++++++++++++++-- nemo/collections/nlp/parts/nlp_overrides.py | 26 ++++ 6 files changed, 236 insertions(+), 41 deletions(-) diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml index 8ce009d5458f..dff963590864 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml 
+++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml @@ -49,8 +49,8 @@ model: precision: ${trainer.precision} # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size - micro_batch_size: 1 # limited by GPU memory - global_batch_size: 1 # will use more micro batches to reach global batch size + micro_batch_size: 16 # limited by GPU memory + global_batch_size: 16 # will use more micro batches to reach global batch size native_amp_init_scale: 65536.0 # Init scale for grad scaler used at fp16 @@ -97,15 +97,15 @@ model: unet_config: _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel from_pretrained: #/ckpts/nemo-v1-2.ckpt - from_NeMo: True #Must be specified when from pretrained is not None, False means loading unet from HF ckpt + from_NeMo: False #Must be specified when from pretrained is not None, False means loading unet from HF ckpt image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: - - 4 - - 2 - - 1 + - 4 + - 2 + - 1 num_res_blocks: 2 channel_mult: - 1 @@ -121,6 +121,7 @@ model: use_flash_attention: True unet_precision: fp32 resblock_gn_groups: 32 + use_te_fp8: False first_stage_config: _target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL @@ -140,22 +141,22 @@ model: - 4 - 4 num_res_blocks: 2 - attn_resolutions: [] + attn_resolutions: [ ] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: - _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenMegatronCLIPEmbedder - restore_from_path: /ckpts/openai.nemo + _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder + version: openai/clip-vit-large-patch14 device: cuda - freeze: True - layer: "last" - # For compatibility of history version that uses HF clip model - # _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder - # version: openai/clip-vit-large-patch14 - # device: cuda - # max_length: 77 + max_length: 77 + # _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenMegatronCLIPEmbedder + # restore_from_path: /ckpts/openai-old.nemo + # device: cuda + # freeze: True + # layer: "last" + # miscellaneous @@ -163,7 +164,7 @@ model: resume_from_checkpoint: null # manually set the checkpoint file to load from apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - ddp_overlap: True # True for using PyTorch DDP overlap. + ddp_overlap: False # True for using PyTorch DDP overlap. 
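# Illustration (not part of this patch): the `use_te_fp8` flag added above is off by default,
# so FP8 training stays opt-in. A rough sketch of overrides one might pass to try it -- the
# exact precision pairing is an assumption, not taken from the patch -- assuming Transformer
# Engine is installed and an FP8-capable (Hopper) GPU is available:
#   model.unet_config.use_te_fp8=True model.unet_config.unet_precision=fp16 trainer.precision=16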
optim: name: fused_adam @@ -191,7 +192,7 @@ model: synthetic_data_length: 10000 train: dataset_path: - - /datasets/coyo/test.pkl + - /datasets/coyo/wdinfo/coyo-700m/wdinfo-selene.pkl augmentations: resize_smallest_side: 512 center_crop_h_w: 512, 512 diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_infer.py b/examples/multimodal/text_to_image/stable_diffusion/sd_infer.py index f1e5e2872ea7..58e9e6e64470 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_infer.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_infer.py @@ -28,6 +28,9 @@ def model_cfg_modifier(model_cfg): model_cfg.unet_config.use_flash_attention = False model_cfg.unet_config.from_pretrained = None model_cfg.first_stage_config.from_pretrained = None + model_cfg.first_stage_config._target_ = ( + 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL' + ) torch.backends.cuda.matmul.allow_tf32 = True trainer, megatron_diffusion_model = setup_trainer_and_model_for_inference( diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 7023f57652b5..6ea4314ab71f 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -1674,7 +1674,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # megatron_amp_O2 is not yet supported in diffusion models self.megatron_amp_O2 = cfg.get('megatron_amp_O2', False) - if self.cfg.precision in ['16', 16, 'bf16']: + if self.megatron_amp_O2 and self.cfg.precision in ['16', 16, 'bf16']: self.model_parallel_config.enable_autocast = False if not hasattr(self.cfg.unet_config, 'unet_precision') or not '16' in str( self.cfg.unet_config.unet_precision diff --git a/nemo/collections/multimodal/modules/stable_diffusion/attention.py b/nemo/collections/multimodal/modules/stable_diffusion/attention.py index c92980d904f6..3fcab2127f4f 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/attention.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/attention.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
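# Illustration (not part of this patch): the attention.py changes below fold the per-block
# nn.LayerNorm modules into Transformer Engine's fused layers when use_te=True, roughly:
#   FeedForward:    LayerNorm -> Linear -> GELU/GEGLU -> Linear   becomes  te.LayerNormMLP
#   CrossAttention: LayerNorm -> to_q Linear                      becomes  te.LayerNormLinear
# For self-attention, LayerNormLinear is built with return_layernorm_output=True so the
# normalized input can be reused as the k/v context, which is why BasicTransformerBlock drops
# its explicit norm1/norm2/norm3 and applies attn1/attn2/ff directly to the residual stream.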
import math +import os from inspect import isfunction import torch @@ -21,6 +22,13 @@ from torch import einsum, nn from torch._dynamo import disable +if os.environ.get("USE_NATIVE_GROUP_NORM", "0") == "1": + from nemo.gn_native import GroupNormNormlization as GroupNorm +else: + from apex.contrib.group_norm import GroupNorm + +from transformer_engine.pytorch.module import LayerNormLinear, LayerNormMLP + from nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.util import checkpoint from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, @@ -96,13 +104,19 @@ def forward(self, x): class FeedForward(nn.Module): - def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0): + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0, use_te=False): super().__init__() inner_dim = int(dim * mult) dim_out = default(dim_out, dim) - project_in = nn.Sequential(LinearWrapper(dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) - self.net = nn.Sequential(project_in, nn.Dropout(dropout), LinearWrapper(inner_dim, dim_out)) + if use_te: + activation = 'gelu' if not glu else 'geglu' + # TODO: more parameters to be confirmed, dropout, seq_length + self.net = LayerNormMLP(hidden_size=dim, ffn_hidden_size=inner_dim, activation=activation,) + else: + norm = nn.LayerNorm(dim) + project_in = nn.Sequential(LinearWrapper(dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) + self.net = nn.Sequential(norm, project_in, nn.Dropout(dropout), LinearWrapper(inner_dim, dim_out)) def forward(self, x): return self.net(x) @@ -225,10 +239,15 @@ def __init__( dropout=0.0, use_flash_attention=False, lora_network_alpha=None, + use_te=False, ): super().__init__() self.inner_dim = dim_head * heads + if context_dim is None: + self.is_self_attn = True + else: + self.is_self_attn = False # cross-attention context_dim = default(context_dim, query_dim) # make attention part be aware of self-attention/cross-attention self.context_dim = context_dim @@ -238,10 +257,19 @@ def __init__( self.scale = dim_head ** -0.5 self.heads = heads - self.to_q = LinearWrapper(query_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha) self.to_k = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha) self.to_v = LinearWrapper(context_dim, self.inner_dim, bias=False, lora_network_alpha=lora_network_alpha) + self.use_te = use_te + if use_te: + return_layernorm_output = True if self.is_self_attn else False + self.norm_to_q = LayerNormLinear( + query_dim, self.inner_dim, bias=False, return_layernorm_output=return_layernorm_output + ) + else: + self.norm = nn.LayerNorm(query_dim) + self.to_q = LinearWrapper(query_dim, self.inner_dim, bias=False) + self.to_out = nn.Sequential( LinearWrapper(self.inner_dim, query_dim, lora_network_alpha=lora_network_alpha), nn.Dropout(dropout) ) @@ -262,8 +290,18 @@ def forward(self, x, context=None, mask=None, additional_tokens=None, n_times_cr # add additional token x = torch.cat([additional_tokens, x], dim=1) - q = self.to_q(x) - context = default(context, x) + if self.use_te: + q_out = self.norm_to_q(x) + if self.is_self_attn: + q, ln_out = q_out + context = default(context, ln_out) + else: + q = q_out + context = default(context, x) + else: + x = self.norm(x) + q = self.to_q(x) + context = default(context, x) k = self.to_k(context) v = self.to_v(context) @@ -351,6 +389,7 @@ def __init__( use_flash_attention=False, disable_self_attn=False, 
lora_network_alpha=None, + use_te=False, ): super().__init__() self.disable_self_attn = disable_self_attn @@ -362,8 +401,9 @@ def __init__( use_flash_attention=use_flash_attention, context_dim=context_dim if self.disable_self_attn else None, lora_network_alpha=lora_network_alpha, + use_te=use_te, ) # is a self-attention - self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) + self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff, use_te=use_te) self.attn2 = CrossAttention( query_dim=dim, context_dim=context_dim, @@ -372,10 +412,8 @@ def __init__( dropout=dropout, use_flash_attention=use_flash_attention, lora_network_alpha=lora_network_alpha, + use_te=use_te, ) # is self-attn if context is none - self.norm1 = nn.LayerNorm(dim) - self.norm2 = nn.LayerNorm(dim) - self.norm3 = nn.LayerNorm(dim) self.use_checkpoint = use_checkpoint def forward(self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0): @@ -397,15 +435,15 @@ def forward(self, x, context=None, additional_tokens=None, n_times_crossframe_at def _forward(self, x, context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0): x = ( self.attn1( - self.norm1(x), + x, context=context if self.disable_self_attn else None, additional_tokens=additional_tokens, n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self if not self.disable_self_attn else 0, ) + x ) - x = self.attn2(self.norm2(x), context=context, additional_tokens=additional_tokens) + x - x = self.ff(self.norm3(x)) + x + x = self.attn2(x, context=context, additional_tokens=additional_tokens) + x + x = self.ff(x) + x return x @@ -431,6 +469,7 @@ def __init__( use_checkpoint=False, use_flash_attention=False, lora_network_alpha=None, + use_te=False, ): super().__init__() logging.info( @@ -473,6 +512,7 @@ def __init__( use_flash_attention=use_flash_attention, disable_self_attn=disable_self_attn, lora_network_alpha=lora_network_alpha, + use_te=use_te, ) for d in range(depth) ] diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index 5ff0f6aa8a8a..b610f921a22a 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
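The attention.py changes above fold the per-block LayerNorms (norm1, norm2, norm3) into CrossAttention and FeedForward themselves, so that when use_te is enabled they can be fused into Transformer Engine's LayerNormLinear and LayerNormMLP. A reduced before/after sketch of that wiring is given below; the module names and shapes are simplified stand-ins, not the actual NeMo classes.

# Simplified illustration of the pre-LN refactor in BasicTransformerBlock._forward.
# After the change the block no longer owns norm1/norm2/norm3: each sub-module
# normalizes its own input, which is what lets the fused TE modules slot in.
import torch
import torch.nn as nn

class NormedAttention(nn.Module):
    # Stand-in for the refactored CrossAttention with use_te=False: LN, then QKV.
    def __init__(self, dim, heads=4):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x, context=None):
        h = self.norm(x)
        ctx = h if context is None else context
        out, _ = self.attn(h, ctx, ctx)
        return out

class BlockAfterRefactor(nn.Module):
    # Residual wiring mirrors the new _forward: no block-level LayerNorms left.
    def __init__(self, dim):
        super().__init__()
        self.attn1 = NormedAttention(dim)
        self.attn2 = NormedAttention(dim)
        self.ff = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x, context=None):
        x = self.attn1(x) + x                   # was attn1(norm1(x)) + x
        x = self.attn2(x, context=context) + x  # was attn2(norm2(x), ...) + x
        return self.ff(x) + x                   # was ff(norm3(x)) + x

if __name__ == "__main__":
    blk = BlockAfterRefactor(64)
    print(blk(torch.randn(2, 16, 64)).shape)  # torch.Size([2, 16, 64])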
import math +import os +import re from abc import abstractmethod from collections.abc import Iterable +from contextlib import nullcontext from functools import partial from typing import Iterable @@ -22,6 +25,9 @@ import torch as th import torch.nn as nn import torch.nn.functional as F + +# FP8 related import +import transformer_engine from apex.contrib.group_norm import GroupNorm from nemo.collections.multimodal.modules.stable_diffusion.attention import SpatialTransformer @@ -62,6 +68,34 @@ def convert_module_to_fp32(module, enable_norm_layers=False): convert_module_to_dtype(module, torch.float32, enable_norm_layers) +def convert_module_to_fp8(model): + def _set_module(model, submodule_key, module): + tokens = submodule_key.split('.') + sub_tokens = tokens[:-1] + cur_mod = model + for s in sub_tokens: + cur_mod = getattr(cur_mod, s) + setattr(cur_mod, tokens[-1], module) + + import copy + + from transformer_engine.pytorch.module import Linear as te_Linear + + for n, v in model.named_modules(): + if isinstance(v, torch.nn.Linear): + # if n in ['class_embed', 'bbox_embed.layers.0', 'bbox_embed.layers.1', 'bbox_embed.layers.2']: continue + logging.info(f'[INFO] Replace Linear: {n}, weight: {v.weight.shape}') + if v.bias is None: + is_bias = False + else: + is_bias = True + newlinear = te_Linear(v.in_features, v.out_features, bias=is_bias) + newlinear.weight = copy.deepcopy(v.weight) + if v.bias is not None: + newlinear.bias = copy.deepcopy(v.bias) + _set_module(model, n, newlinear) + + class AttentionPool2d(nn.Module): """ Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py @@ -553,6 +587,7 @@ def __init__( unet_precision: str = "fp32", lora_network_alpha=None, timesteps=1000, + use_te_fp8: bool = False, ): super().__init__() from omegaconf.listconfig import ListConfig @@ -663,6 +698,7 @@ def __init__( input_block_chans = [model_channels] ch = model_channels ds = 1 + self.use_te_fp8 = use_te_fp8 for level, mult in enumerate(channel_mult): for nr in range(self.num_res_blocks[level]): layers = [ @@ -713,6 +749,7 @@ def __init__( use_checkpoint=use_checkpoint, use_flash_attention=use_flash_attention, lora_network_alpha=lora_network_alpha, + use_te=self.use_te_fp8, ) ) self.input_blocks.append(TimestepEmbedSequential(*layers)) @@ -778,6 +815,7 @@ def __init__( use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint, use_flash_attention=use_flash_attention, + use_te=self.use_te_fp8, lora_network_alpha=lora_network_alpha, ), ResBlock( @@ -844,6 +882,7 @@ def __init__( use_checkpoint=use_checkpoint, use_flash_attention=use_flash_attention, lora_network_alpha=lora_network_alpha, + use_te=self.use_te_fp8, ) ) if level and i == self.num_res_blocks[level]: @@ -899,6 +938,34 @@ def __init__( self.convert_to_fp16() elif unet_precision == 'fp16': self.convert_to_fp16(enable_norm_layers=True) + elif self.use_te_fp8: + assert unet_precision != 'fp16', "fp8 training can't work with fp16 O2 amp recipe" + convert_module_to_fp8(self) + + fp8_margin = int(os.getenv("FP8_MARGIN", '0')) + fp8_interval = int(os.getenv("FP8_INTERVAL", '1')) + fp8_format = os.getenv("FP8_FORMAT", "hybrid") + fp8_amax_history_len = int(os.getenv("FP8_HISTORY_LEN", '1024')) + fp8_amax_compute_algo = os.getenv("FP8_COMPUTE_ALGO", 'max') + fp8_wgrad = os.getenv("FP8_WGRAD", '1') == '1' + + fp8_format_dict = { + 'hybrid': transformer_engine.common.recipe.Format.HYBRID, + 'e4m3': transformer_engine.common.recipe.Format.E4M3, + } + fp8_format = fp8_format_dict[fp8_format] + + self.fp8_recipe = 
transformer_engine.common.recipe.DelayedScaling( + margin=fp8_margin, + interval=fp8_interval, + fp8_format=fp8_format, + amax_history_len=fp8_amax_history_len, + amax_compute_algo=fp8_amax_compute_algo, + override_linear_precision=(False, False, not fp8_wgrad), + ) + old_state_dict = self.state_dict() + new_state_dict = self.te_fp8_key_mapping(old_state_dict) + self.load_state_dict(new_state_dict, strict=False) self.unet_precision = unet_precision @@ -1000,8 +1067,65 @@ def _sdxl_embedding_mapping(self, sdxl_dict): res_dict[new_key_] = value_ return res_dict + def _legacy_unet_ckpt_mapping(self, unet_dict): + new_dict = {} + key_map = { + 'transformer_blocks.0.norm1.weight': 'transformer_blocks.0.attn1.norm.weight', + 'transformer_blocks.0.norm1.bias': 'transformer_blocks.0.attn1.norm.bias', + 'transformer_blocks.0.norm2.weight': 'transformer_blocks.0.attn2.norm.weight', + 'transformer_blocks.0.norm2.bias': 'transformer_blocks.0.attn2.norm.bias', + 'transformer_blocks.0.norm3.weight': 'transformer_blocks.0.ff.net.0.weight', + 'transformer_blocks.0.norm3.bias': 'transformer_blocks.0.ff.net.0.bias', + 'transformer_blocks.0.ff.net.0.proj.weight': 'transformer_blocks.0.ff.net.1.proj.weight', + 'transformer_blocks.0.ff.net.0.proj.bias': 'transformer_blocks.0.ff.net.1.proj.bias', + 'transformer_blocks.0.ff.net.2.weight': 'transformer_blocks.0.ff.net.3.weight', + 'transformer_blocks.0.ff.net.2.bias': 'transformer_blocks.0.ff.net.3.bias', + } + + pattern = re.compile(r'(input_blocks|output_blocks)\.[\d\w]+\.[\d\w]+\.') + pattern_middle_block = re.compile(r'middle_block\.[\d\w]+\.') + for old_key, value in unet_dict.items(): + match = pattern.match(old_key) + match_middle = pattern_middle_block.match(old_key) + if match or match_middle: + prefix = match.group(0) if match else match_middle.group(0) + suffix = old_key.split('.', 3)[-1] if match else old_key.split('.', 2)[-1] + if suffix in key_map: + new_key = prefix + key_map[suffix] + new_dict[new_key] = value + else: + new_dict[old_key] = value + else: + new_dict[old_key] = value + + return new_dict + + def te_fp8_key_mapping(self, unet_dict): + new_state_dict = {} + for key in unet_dict.keys(): + if 'extra_state' in key: + continue + + ### LayerNormLinear + # norm_to_q.layer_norm_{weight|bias} -> norm.{weight|bias} + # norm_to_q.weight -> to_q.weight + new_key = key.replace('attn1.norm.', 'attn1.norm_to_q.layer_norm_') + new_key = new_key.replace('attn1.to_q.weight', 'attn1.norm_to_q.weight',) + new_key = new_key.replace('attn2.norm.', 'attn2.norm_to_q.layer_norm_') + new_key = new_key.replace('attn2.to_q.weight', 'attn2.norm_to_q.weight',) + + ### LayerNormMLP + # ff.net.layer_norm_{weight|bias} -> ff.net.0.{weight|bias} + # ff.net.fc1_{weight|bias} -> ff.net.1.proj.{weight|bias} + # ff.net.fc2_{weight|bias} -> ff.net.3.{weight|bias} + new_key = new_key.replace('ff.net.0.', 'ff.net.layer_norm_') + new_key = new_key.replace('ff.net.1.proj.', 'ff.net.fc1_') + new_key = new_key.replace('ff.net.3.', 'ff.net.fc2_') + + new_state_dict[new_key] = unet_dict[key] + return new_state_dict + def _state_key_mapping(self, state_dict: dict): - import re res_dict = {} input_dict = {} @@ -1027,13 +1151,7 @@ def _state_key_mapping(self, state_dict: dict): mid_dict = self._mid_blocks_mapping(mid_dict) other_dict = self._other_blocks_mapping(other_dict) sdxl_dict = self._sdxl_embedding_mapping(sdxl_dict) - # key_list = state_dict.keys() - # key_str = " ".join(key_list) - # for key_, val_ in state_dict.items(): - # key_ = key_.replace("down_blocks", 
"input_blocks")\ - # .replace("up_blocks", 'output_blocks') - # res_dict[key_] = val_ res_dict.update(input_dict) res_dict.update(output_dict) res_dict.update(mid_dict) @@ -1046,6 +1164,7 @@ def _load_pretrained_model(self, state_dict, ignore_mismatched_sizes=False, from state_dict = self._strip_unet_key_prefix(state_dict) if not from_NeMo: state_dict = self._state_key_mapping(state_dict) + state_dict = self._legacy_unet_ckpt_mapping(state_dict) model_state_dict = self.state_dict() loaded_keys = [k for k in state_dict.keys()] @@ -1151,7 +1270,7 @@ def convert_to_fp16(self, enable_norm_layers=False): """ self.apply(lambda module: convert_module_to_fp16(module=module, enable_norm_layers=enable_norm_layers)) - def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + def _forward(self, x, timesteps=None, context=None, y=None, **kwargs): """ Apply the model to an input batch. @@ -1170,7 +1289,6 @@ def forward(self, x, timesteps=None, context=None, y=None, **kwargs): self.num_classes is not None ), "must specify y if and only if the model is class-conditional" hs = [] - if self.unet_precision == "fp16-mixed" or self.unet_precision == "fp16": x = x.type(torch.float16) if context is not None: @@ -1197,6 +1315,13 @@ def forward(self, x, timesteps=None, context=None, y=None, **kwargs): else: return self.out(h) + def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + with transformer_engine.pytorch.fp8_autocast( + enabled=self.use_te_fp8, fp8_recipe=self.fp8_recipe, + ) if self.use_te_fp8 else nullcontext(): + out = self._forward(x, timesteps, context, y, **kwargs) + return out + class EncoderUNetModel(nn.Module): """ diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index a6f68f0666b5..0a030759fe9b 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -73,6 +73,7 @@ try: from apex.transformer.pipeline_parallel.utils import get_num_microbatches + from nemo.core.optim.distributed_adam import MegatronDistributedFusedAdam HAVE_APEX = True @@ -1057,6 +1058,31 @@ def should_process(key): new_state_dict[key_] = state_dict[key_] state_dict = new_state_dict + if conf.get('unet_config') and conf.get('unet_config').get('use_te_fp8') == False: + # Mapping potential fp8 ckpt to fp16 model + # remove _extra_state in fp8 if there is. 
+ new_state_dict = {} + for key in state_dict.keys(): + if 'extra_state' in key: + continue + + ### LayerNormLinear + # norm_to_q.layer_norm_{weight|bias} -> norm.{weight|bias} + # norm_to_q.weight -> to_q.weight + new_key = key.replace('norm_to_q.layer_norm_', 'norm.') + new_key = new_key.replace('norm_to_q.weight', 'to_q.weight') + + ### LayerNormMLP + # ff.net.layer_norm_{weight|bias} -> ff.net.0.{weight|bias} + # ff.net.fc1_{weight|bias} -> ff.net.1.proj.{weight|bias} + # ff.net.fc2_{weight|bias} -> ff.net.3.{weight|bias} + new_key = new_key.replace('ff.net.layer_norm_', 'ff.net.0.') + new_key = new_key.replace('ff.net.fc1_', 'ff.net.1.proj.') + new_key = new_key.replace('ff.net.fc2_', 'ff.net.3.') + + new_state_dict[new_key] = state_dict[key] + state_dict = new_state_dict + return state_dict def _load_state_dict_from_disk(self, model_weights, map_location=None): From fcc19d8f45098a924e755b03ceecedf432f4cefb Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Tue, 30 Apr 2024 23:12:26 -0700 Subject: [PATCH 010/178] Add ASR latest news (#9073) * Add ASR news Signed-off-by: smajumdar * Fix url Signed-off-by: smajumdar --------- Signed-off-by: smajumdar --- README.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.rst b/README.rst index 66b3a5806c2d..0b05bd0390f8 100644 --- a/README.rst +++ b/README.rst @@ -77,6 +77,31 @@ Latest News +
+ Speech Recognition
+
+ New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model (2024/04/18)
+ The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. Canary also provides bi-directional translation, between English and the three other supported languages.
+
+ Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models (2024/04/18)
+ NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy.
+
+ Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT (2024/04/18)
+ NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. This new addition to the NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previously best model, Parakeet-RNNT-1.1B.
+
+ From 8e65042d15062ce3fbe639f9d428c639510d894c Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Wed, 1 May 2024 14:00:20 +0300 Subject: [PATCH 011/178] zarr ckpt to torch_dist ckpt converter (#8842) * add zarr to torch_dist ckpt converter Signed-off-by: dimapihtar * add zarr to torch_dist ckpt converter Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add bert and gpt-sft support Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * switch to MegatronTrainerBuilder Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused imports Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * remove unused imports Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../convert_zarr_to_torch_dist.py | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 scripts/checkpoint_converters/convert_zarr_to_torch_dist.py diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py new file mode 100644 index 000000000000..29b56aa706fa --- /dev/null +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -0,0 +1,193 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +Conversion script to convert zarr checkpoints into torch distributed checkpoint. + Example to run this conversion script: + python -m torch.distributed.launch --nproc_per_node= * \ + megatron_zarr_ckpt_to_torch_dist.py \ + --model_type \ + --checkpoint_folder \ + --checkpoint_name \ + --path_to_save \ + --tensor_model_parallel_size \ + --pipeline_model_parallel_size \ + --hparams_file \ + --gpus_per_node +""" + +import os +from argparse import ArgumentParser + +import torch +from megatron.core import parallel_state +from omegaconf import OmegaConf, open_dict + +from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.utils import AppState, logging +from nemo.utils.distributed import initialize_distributed + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--checkpoint_folder", + type=str, + default=None, + required=True, + help="Path to PTL checkpoints saved during training. 
Ex: /raid/nemo_experiments/megatron_gpt/checkpoints", + ) + parser.add_argument( + "--checkpoint_name", + type=str, + default=None, + required=True, + help="Name of checkpoint to be used. Ex: megatron_gpt--val_loss=0.14-step=20-consumed_samples=160.0-last", + ) + + parser.add_argument( + "--hparams_file", + type=str, + default=None, + required=True, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.") + parser.add_argument( + "--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.", + ) + parser.add_argument("--gpus_per_node", type=int, required=True, default=None) + parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) + parser.add_argument("--pipeline_model_parallel_size", type=int, required=True, default=None) + parser.add_argument( + "--pipeline_model_parallel_split_rank", + type=int, + required=False, + default=None, + help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", + ) + parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + parser.add_argument("--cluster_type", required=False, default=None, help="Whether on BCP platform") + parser.add_argument( + "--precision", + type=str, + required=False, + default='bf16-mixed', + choices=['32-true', '16-mixed', 'bf16-mixed'], + help="Precision value for the trainer that matches with precision of the ckpt", + ) + + parser.add_argument( + "--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"], + ) + + args = parser.parse_args() + return args + + +def convert(local_rank, rank, world_size, args): + + app_state = AppState() + app_state.data_parallel_rank = 0 + num_nodes = world_size // args.gpus_per_node + + cfg = { + 'trainer': { + 'devices': args.gpus_per_node, + 'num_nodes': num_nodes, + 'accelerator': 'gpu', + 'precision': args.precision, + }, + 'model': { + 'native_amp_init_scale': 2 ** 32, + 'native_amp_growth_interval': 1000, + 'hysteresis': 2, + 'gradient_as_bucket_view': True, + }, + 'cluster_type': args.cluster_type, + } + cfg = OmegaConf.create(cfg) + + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None + + trainer = MegatronTrainerBuilder(cfg).create_trainer() + + app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = args.tensor_model_parallel_size + app_state.pipeline_model_parallel_split_rank = None + + app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size + + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=app_state.tensor_model_parallel_size, + pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, + ) + + app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank() + app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank() + + # check for distributed checkpoint + checkpoint_path = os.path.join(args.checkpoint_folder, args.checkpoint_name) + + 
logging.info( + f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}' + ) + + if args.model_type == "gpt": + model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer) + elif args.model_type == "sft": + model = MegatronGPTSFTModel.load_from_checkpoint( + checkpoint_path, hparams_file=args.hparams_file, trainer=trainer + ) + # we force the target for the loaded model to have the correct target + # because the hparams.yaml sometimes contains MegatronGPTModel as the target. + with open_dict(model.cfg): + model.cfg.target = f"{MegatronGPTSFTModel.__module__}.{MegatronGPTSFTModel.__name__}" + elif args.model_type == 'bert': + model = MegatronBertModel.load_from_checkpoint( + checkpoint_path, hparams_file=args.hparams_file, trainer=trainer + ) + + with open_dict(model.cfg): + model.cfg.torch_distributed_checkpoint = True + + model._save_restore_connector = NLPSaveRestoreConnector() + save_file_path = args.path_to_save + if not args.save_to_nemo: + # With --save_to_nemo, save_to_path is expected to be a directory. + # Adding a dummy model filename here conforms with SaveRestoreConnector's convention. + model._save_restore_connector.pack_nemo_file = False + save_file_path = os.path.join(save_file_path, 'model.nemo') + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + model.save_to(save_file_path) + + logging.info(f'NeMo model saved to: {args.path_to_save}') + + +if __name__ == '__main__': + args = get_args() + + local_rank, rank, world_size = initialize_distributed(args) + + convert(local_rank, rank, world_size, args) From 5d5919fe8a4877d260a22cd80690b7ef3acde7a0 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Wed, 1 May 2024 10:50:52 -0700 Subject: [PATCH 012/178] unfused lora (#9004) * WIP unfused lora Signed-off-by: arendu * unfused lora training and generation Signed-off-by: arendu * update Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update Signed-off-by: arendu * GQA support for unfused lora Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * converter for fused to unfused lora added Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * defaults Signed-off-by: arendu * refac Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * cleaned Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * unfusing h to 4h adapter Signed-off-by: arendu * unfused hto 4h Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix for canonical Signed-off-by: arendu * updates Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/megatron_gpt_finetuning_config.yaml | 1 + .../conf/megatron_gpt_generate_config.yaml | 1 + .../common/megatron/adapters/mcore_mixins.py | 42 +++- .../megatron/adapters/parallel_adapters.py | 179 +++++++++++++++ 
nemo/collections/nlp/parts/peft_config.py | 50 ++++- .../convert_nemo_to_canonical.py | 212 ++++++++++++++++++ 6 files changed, 469 insertions(+), 16 deletions(-) create mode 100644 scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 40347f317fbb..6517b62010b4 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -101,6 +101,7 @@ model: position_embedding_strategy: null # used only when weight_tying is True lora_tuning: + variant: "nemo" # can be "nemo" or "canonical" target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) adapter_dim: 32 alpha: ${model.peft.lora_tuning.adapter_dim} diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_generate_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_generate_config.yaml index 67d43eb303f4..592eed6c4420 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_generate_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_generate_config.yaml @@ -89,6 +89,7 @@ model: position_embedding_strategy: null # used only when weight_tying is True lora_tuning: + variant: "nemo" # can be either "canonical" or "nemo" target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) adapter_dim: 32 adapter_dropout: 0.0 diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index a5e886f3b479..16ded8e2c682 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -37,6 +37,8 @@ LoraDenseAttentionAdapterConfig, LoraHto4HAdapterConfig, LoraKQVAdapterConfig, + LoraUnfusedHto4HAdapterConfig, + LoraUnfusedKQVAdapterConfig, MLPInfusedAdapterConfig, ParallelLinearAdapterConfig, PromptEncoderAdapterConfig, @@ -67,7 +69,12 @@ def mcore_register_adapters(self): Setup NeMo LoRA or IA3 adapter to this MCore layer. 
""" self.set_accepted_adapter_types( - [LoraKQVAdapterConfig._target_, LoraDenseAttentionAdapterConfig._target_, InfusedAdapterConfig._target_] + [ + LoraUnfusedKQVAdapterConfig._target_, + LoraKQVAdapterConfig._target_, + LoraDenseAttentionAdapterConfig._target_, + InfusedAdapterConfig._target_, + ] ) self.linear_qkv.return_layernorm_output = True # need layernorm output for lora mlp if ( @@ -102,12 +109,20 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): # LoRA logic if self.is_adapter_available(): + lora_adapter = None lora_kqv_adapter = self.get_adapter_module(AdapterName.LORA_KQV_ADAPTER) + lora_unfused_kqv_adapter = self.get_adapter_module(AdapterName.LORA_UNFUSED_KQV_ADAPTER) if lora_kqv_adapter and self.adapter_cfg[AdapterName.LORA_KQV_ADAPTER]['enabled']: + lora_adapter = lora_kqv_adapter + if lora_unfused_kqv_adapter and self.adapter_cfg[AdapterName.LORA_UNFUSED_KQV_ADAPTER]['enabled']: + assert lora_adapter is None, "Expected only one of lora_kqv_adapter or lora_unfused_kqv_adapter" + lora_adapter = lora_unfused_kqv_adapter + + if lora_adapter: if layernorm_output is not None: - lora_mixed_qkv = lora_kqv_adapter(layernorm_output) + lora_mixed_qkv = lora_adapter(layernorm_output) else: - lora_mixed_qkv = lora_kqv_adapter(hidden_states) + lora_mixed_qkv = lora_adapter(hidden_states) mixed_qkv = mixed_qkv + lora_mixed_qkv @@ -251,7 +266,12 @@ def mcore_register_adapters(self): Setup NeMo IA3 adapter to this MCore layer. """ self.set_accepted_adapter_types( - [LoraHto4HAdapterConfig._target_, Lora4HtoHAdapterConfig._target_, MLPInfusedAdapterConfig._target_] + [ + LoraUnfusedHto4HAdapterConfig._target_, + LoraHto4HAdapterConfig._target_, + Lora4HtoHAdapterConfig._target_, + MLPInfusedAdapterConfig._target_, + ] ) # only self attn (packed qkv) for now self.linear_fc1.return_layernorm_output = True # need layernorm output for lora mlp if ( @@ -274,9 +294,17 @@ def forward(self, hidden_states): # LoRA logic if self.is_adapter_available(): - lora_linear_fc1_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) - if lora_linear_fc1_adapter and self.adapter_cfg[AdapterName.LORA_Hto4H_ADAPTER]['enabled']: - lora_output = lora_linear_fc1_adapter(layernorm_output) + lora_adapter = None + lora_fc1_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) + lora_unfused_fc1_adapter = self.get_adapter_module(AdapterName.LORA_UNFUSED_Hto4H_ADAPTER) + if lora_fc1_adapter and self.adapter_cfg[AdapterName.LORA_Hto4H_ADAPTER]['enabled']: + lora_adapter = lora_fc1_adapter + if lora_unfused_fc1_adapter and self.adapter_cfg[AdapterName.LORA_UNFUSED_Hto4H_ADAPTER]['enabled']: + assert lora_adapter is None, "Expected only one of LORA_Hto4H_ADAPTER or LORA_UNFUSED_Hto4H_ADAPTER" + lora_adapter = lora_unfused_fc1_adapter + + if lora_adapter: + lora_output = lora_adapter(layernorm_output) intermediate_parallel = intermediate_parallel + lora_output if self.config.bias_activation_fusion: diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 5037bb1b3634..2a5372d11ab5 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -75,11 +75,13 @@ class AdapterName(str, enum.Enum): POST_ATTN_ADAPTER = 'adapter_2' PTUNING_ADAPTER = "ptuning_adapter" LORA_KQV_ADAPTER = "lora_kqv_adapter" + LORA_UNFUSED_KQV_ADAPTER = 
"lora_unfused_kqv_adapter" LORA_KV_ADAPTER = "lora_kv_adapter" LORA_Q_ADAPTER = "lora_q_adapter" MM_LINEAR_ADAPTER = "mm_linear_adapter" LORA_DENSE_ATTENTION_ADAPTER = "lora_dense_attention_adapter" LORA_Hto4H_ADAPTER = "lora_hto4h_adapter" + LORA_UNFUSED_Hto4H_ADAPTER = "lora_unfused_hto4h_adapter" LORA_4HtoH_ADAPTER = "lora_4htoh_adapter" MULTIMODAL_PROJECTOR_ADAPTER = "mm_projector_adapter" PARALLEL_LINEAR_ADAPTER = "parallel_linear_adapter" @@ -457,6 +459,183 @@ class Lora4HtoHAdapterConfig(ParallelLinearAdapterConfig): input_is_parallel: bool = True +class LoraUnfusedHto4HAdapter(nn.Module, AdapterModuleUtil): + def __init__( + self, + in_features: int, + out_features: int, + dim: int, + activation: str = 'swish', + norm_position: Optional[str] = 'post', + norm_type: Optional[str] = 'mixedfusedlayernorm', + column_init_method: str = 'xavier', # TODO: (@adithyare) should rename this to input_init_method to be more precise. + row_init_method: str = 'zero', # TODO: (@adithyare) should rename this to output_init_method to be more precise. + gather_output: bool = True, + input_is_parallel: bool = False, # NOTE: (@ertkonuk) we need this for LoRA adapters that are applied to RowParallelLinear layers + dropout: float = 0.0, + model_parallel_config: Optional[ModelParallelConfig] = None, + alpha: float | None = None, + dropout_position: str = 'post', + a2a_experimental: bool = False, # TODO: should rename this or make it a default feature + **kwargs, + ): + super().__init__() + self.gate_adapter = ParallelLinearAdapter( + in_features, + out_features // 2, + dim, + activation, + norm_position, + norm_type, + column_init_method, + row_init_method, + gather_output, + input_is_parallel, + dropout, + model_parallel_config, + alpha, + dropout_position, + a2a_experimental, + ) + self.up_adapter = ParallelLinearAdapter( + in_features, + out_features // 2, + dim, + activation, + norm_position, + norm_type, + column_init_method, + row_init_method, + gather_output, + input_is_parallel, + dropout, + model_parallel_config, + alpha, + dropout_position, + a2a_experimental, + ) + + def forward(self, x): + gate_x = self.gate_adapter(x) + up_x = self.up_adapter(x) + x = torch.concat([gate_x, up_x], dim=2) + return x + + +@dataclass +class LoraUnfusedHto4HAdapterConfig(ParallelLinearAdapterConfig): + _target_: str = "{0}.{1}".format(LoraUnfusedHto4HAdapter.__module__, LoraUnfusedHto4HAdapter.__name__) + + +class LoraUnfusedKQVAdapter(nn.Module, AdapterModuleUtil): + def __init__( + self, + in_features: int, + dim: int, + activation: str = 'swish', + norm_position: Optional[str] = 'post', + norm_type: Optional[str] = 'mixedfusedlayernorm', + column_init_method: str = 'xavier', # TODO: (@adithyare) should rename this to input_init_method to be more precise. + row_init_method: str = 'zero', # TODO: (@adithyare) should rename this to output_init_method to be more precise. 
+ gather_output: bool = True, + input_is_parallel: bool = False, # NOTE: (@ertkonuk) we need this for LoRA adapters that are applied to RowParallelLinear layers + dropout: float = 0.0, + model_parallel_config: Optional[ModelParallelConfig] = None, + alpha: float | None = None, + dropout_position: str = 'post', + a2a_experimental: bool = False, # TODO: should rename this or make it a default feature + num_query_groups: Optional[int] = None, + kv_channels: Optional[int] = None, + **kwargs, + ): + super().__init__() + if num_query_groups is not None and kv_channels is not None: + out_features = kv_channels * num_query_groups + else: + out_features = in_features + + self.q_adapter = ParallelLinearAdapter( + in_features, + in_features, + dim, + activation, + norm_position, + norm_type, + column_init_method, + row_init_method, + gather_output, + input_is_parallel, + dropout, + model_parallel_config, + alpha, + dropout_position, + a2a_experimental, + ) + + self.k_adapter = ParallelLinearAdapter( + in_features, + out_features, + dim, + activation, + norm_position, + norm_type, + column_init_method, + row_init_method, + gather_output, + input_is_parallel, + dropout, + model_parallel_config, + alpha, + dropout_position, + a2a_experimental, + ) + self.v_adapter = ParallelLinearAdapter( + in_features, + out_features, + dim, + activation, + norm_position, + norm_type, + column_init_method, + row_init_method, + gather_output, + input_is_parallel, + dropout, + model_parallel_config, + alpha, + dropout_position, + a2a_experimental, + ) + + def forward(self, x): + qx = self.q_adapter(x) + kx = self.k_adapter(x) + vx = self.v_adapter(x) + x = torch.concat([qx, kx, vx], dim=2) + return x + + +@dataclass +class LoraUnfusedKQVAdapterConfig(AdapterConfig): + in_features: int + dim: int + activation: str = 'swish' + norm_position: Optional[str] = 'post' + norm_type: Optional[str] = 'mixedfusedlayernorm' + column_init_method: str = 'xavier' + row_init_method: str = 'zero' + gather_output: bool = True + input_is_parallel: bool = False + dropout: float = 0.0 + dropout_position: str = 'post' + alpha: float | None = None + network_alpha: int | None = None + a2a_experimental: bool = False + num_query_groups: Optional[int] = None + kv_channels: Optional[int] = None + _target_: str = "{0}.{1}".format(LoraUnfusedKQVAdapter.__module__, LoraUnfusedKQVAdapter.__name__) + + class PromptEncoderAdapter(nn.Module, AdapterModuleUtil): """ The Tensor Parallel MLP prompt encoder network that is used to generate the virtual diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 63caa409b218..47d5167d630e 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -36,6 +36,8 @@ LoraHto4HAdapterConfig, LoraKQVAdapterConfig, LoraKQVAdapterWeightTyingConfig, + LoraUnfusedHto4HAdapterConfig, + LoraUnfusedKQVAdapterConfig, MLPInfusedAdapterConfig, ParallelLinearAdapterConfig, ParallelLinearAdapterWeightTyingConfig, @@ -132,11 +134,26 @@ def __init__(self, cfg): for module in target_modules: if module == PEFT_MODULE_MAP["qkv_module"]: - adapter_cfg = self._create_lora_config( - cfg, lora_cfg, cfg.hidden_size, qkv_projection_size, LoraKQVAdapterConfig - ) - name_key_to_cfg[AdapterName.LORA_KQV_ADAPTER] = adapter_cfg - name_key_to_mcore_mixins[AdapterName.LORA_KQV_ADAPTER] = [("self_attention", MCoreSelfAttentionMixin)] + if lora_cfg.get("variant", "nemo") == "canonical": + _adapter_name = AdapterName.LORA_UNFUSED_KQV_ADAPTER + _adapter_cfg_cls 
= LoraUnfusedKQVAdapterConfig + adapter_cfg = self._create_lora_config( + cfg, + lora_cfg, + cfg.hidden_size, + qkv_projection_size, + _adapter_cfg_cls, + num_query_groups=num_query_groups, + kv_channels=kv_channels, + ) + else: + _adapter_name = AdapterName.LORA_KQV_ADAPTER + _adapter_cfg_cls = LoraKQVAdapterConfig + adapter_cfg = self._create_lora_config( + cfg, lora_cfg, cfg.hidden_size, qkv_projection_size, _adapter_cfg_cls + ) + name_key_to_cfg[_adapter_name] = adapter_cfg + name_key_to_mcore_mixins[_adapter_name] = [("self_attention", MCoreSelfAttentionMixin)] elif module == PEFT_MODULE_MAP["dense_module"]: adapter_cfg = self._create_lora_config( @@ -149,11 +166,18 @@ def __init__(self, cfg): elif module == PEFT_MODULE_MAP["hto4h_module"]: hto4h_projection_size = cfg.ffn_hidden_size * 2 if fast_glu_activation else cfg.ffn_hidden_size + if lora_cfg.get("variant", "nemo") == "canonical": + _adapter_name = AdapterName.LORA_UNFUSED_Hto4H_ADAPTER + _adapter_cfg_cls = LoraUnfusedHto4HAdapterConfig + else: + _adapter_name = AdapterName.LORA_Hto4H_ADAPTER + _adapter_cfg_cls = LoraHto4HAdapterConfig + adapter_cfg = self._create_lora_config( - cfg, lora_cfg, cfg.hidden_size, hto4h_projection_size, LoraHto4HAdapterConfig + cfg, lora_cfg, cfg.hidden_size, hto4h_projection_size, _adapter_cfg_cls ) - name_key_to_cfg[AdapterName.LORA_Hto4H_ADAPTER] = adapter_cfg - name_key_to_mcore_mixins[AdapterName.LORA_Hto4H_ADAPTER] = [("mlp", MCoreMLPMixin)] + name_key_to_cfg[_adapter_name] = adapter_cfg + name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] elif module == PEFT_MODULE_MAP["4htoh_module"]: adapter_cfg = self._create_lora_config( cfg, lora_cfg, cfg.ffn_hidden_size, cfg.hidden_size, Lora4HtoHAdapterConfig @@ -170,7 +194,9 @@ def __init__(self, cfg): self.name_key_to_mcore_mixins = name_key_to_mcore_mixins super().__init__(lora_cfg, name_key_to_cfg) - def _create_lora_config(self, cfg, lora_cfg, in_features, out_features, adapter_cfg_cls): + def _create_lora_config( + self, cfg, lora_cfg, in_features, out_features, adapter_cfg_cls, num_query_groups=None, kv_channels=None + ): config_args = { "in_features": in_features, "out_features": out_features, @@ -187,6 +213,12 @@ def _create_lora_config(self, cfg, lora_cfg, in_features, out_features, adapter_ "a2a_experimental": lora_cfg.get("a2a_experimental", False), } + if adapter_cfg_cls == LoraUnfusedKQVAdapterConfig: + assert num_query_groups is not None, "num_query_groups must be provided for canonical Lora" + assert kv_channels is not None, "kv_channels must be provided for canonical Lora" + config_args.update({"num_query_groups": num_query_groups, "kv_channels": kv_channels}) + config_args.pop("out_features") + if lora_cfg.weight_tying: position_embedding_strategy = lora_cfg.get("position_embedding_strategy", None) if position_embedding_strategy is None: diff --git a/scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py b/scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py new file mode 100644 index 000000000000..f2974aca1642 --- /dev/null +++ b/scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py @@ -0,0 +1,212 @@ +#!/usr/bin/env +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Convert nemo style (fused) lora checkpoint to canonical (unfused) lora checkpoint. +Currently supports TP=PP=1 only. + +Example usage: +python scripts/checkpoint_converters/lora_converters/convert_nemo_to_canonical.py \ + --lora_path nemo_style_lora_model.nemo \ + --output_path ./canonical_style_lora_model.nemo + +""" +import tempfile +from argparse import ArgumentParser +from typing import Dict + +import torch +from omegaconf import OmegaConf, open_dict +from scripts.nlp_language_modeling.merge_lora_weights.merge import replace_number_add_offset + +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector + + +def rename_keys(key): + new_keys = [] + if "lora_kqv_adapter" in key: + new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.q_adapter.")) + new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.k_adapter.")) + new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.v_adapter.")) + elif "lora_hto4h_adapter" in key: + new_keys.append(key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.gate_adapter.")) + new_keys.append(key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.up_adapter.")) + return new_keys + + +def reformat_module_names_to_hf(tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + new_tensors = dict() + for module_name, module_weight in tensors.items(): + # map linear_in and linear_out to lora_a/lora_b counterparts + new_module_name = "base_model." 
+ module_name.replace("linear_in", "lora_A").replace("linear_out", "lora_B") + + # map target modules to their vLLM/HF counterparts + new_module_name = new_module_name.replace("q_adapter", "q_proj") + new_module_name = new_module_name.replace("k_adapter", "k_proj") + new_module_name = new_module_name.replace("v_adapter", "v_proj") + new_module_name = new_module_name.replace("lora_dense_attention_adapter", "o_proj") + new_module_name = new_module_name.replace("lora_4htoh_adapter", "down_proj") + new_module_name = new_module_name.replace("gate_adapter", "gate_proj") + new_module_name = new_module_name.replace("up_adapter", "up_proj") + + # map other parts of the module names to fit vLLM/huggingface + new_module_name = new_module_name.replace(".adapter_layer", "") + new_module_name = new_module_name.replace(".lora_unfused_kqv_proj", "") + new_module_name = new_module_name.replace(".lora_unfused_hto4h_adapter", "") + new_module_name = new_module_name.replace("self_attention", "self_attn") + new_module_name = new_module_name.replace("decoder", "model") + + new_tensors[new_module_name] = module_weight + return new_tensors + + +def convert_hto4h(lora_weights, lora_config): + assert len(lora_weights) == 1, "Only single TP supported for now" + keys_to_update = [] + for key in lora_weights[0].keys(): + if "lora_hto4h_adapter" in key: + keys_to_update.append(key) + + for key in keys_to_update: + if "linear_in" in key: + for new_key in rename_keys(key): + lora_weights[0][new_key] = lora_weights[0][key] + print(new_key, lora_weights[0][new_key].shape) + elif "linear_out" in key: + for idx, new_key in enumerate(rename_keys(key)): + orginal_shape = lora_weights[0][key].shape[0] + lora_weights[0][new_key] = lora_weights[0][key][ + idx * (orginal_shape // 2) : (idx + 1) * (orginal_shape // 2) + ] + print(new_key, lora_weights[0][new_key].shape) + + lora_weights[0].pop(key) + return lora_weights + + +def convert_qkv(lora_weights, lora_model_cfg): + assert len(lora_weights) == 1, "Only single TP supported for now" + if ( + lora_model_cfg.get("num_query_groups", lora_model_cfg.num_attention_heads) + != lora_model_cfg.num_attention_heads + ): + kv_channels = int(lora_model_cfg.hidden_size / lora_model_cfg.num_attention_heads) + kv_size = int(lora_model_cfg.num_query_groups * kv_channels) + else: + kv_size = int(lora_model_cfg.hidden_size) + q_size = lora_model_cfg.hidden_size + k_size, v_size = kv_size, kv_size + + keys_to_update = [] + for key in lora_weights[0].keys(): + if "lora_kqv_adapter" in key: + keys_to_update.append(key) + + for key in keys_to_update: + if "linear_in" in key: + for new_key in rename_keys(key): + lora_weights[0][new_key] = lora_weights[0][key] + print(new_key, lora_weights[0][new_key].shape) + elif "linear_out" in key: + srt = 0 + for new_key, size in zip(rename_keys(key), [q_size, k_size, v_size]): + lora_weights[0][new_key] = lora_weights[0][key][srt : srt + size] + print(new_key, lora_weights[0][new_key].shape) + srt = srt + size + + lora_weights[0].pop(key) + return lora_weights + + +def convert_lora(lora_nemo, save_path, hf_format=False): + with tempfile.TemporaryDirectory() as tmpdir: + NLPSaveRestoreConnector._unpack_nemo_file(lora_nemo, tmpdir) + config_file = f"{tmpdir}/model_config.yaml" + lora_config = OmegaConf.load(config_file) + tp_size = lora_config.tensor_model_parallel_size + pp_size = lora_config.pipeline_model_parallel_size + + lora_state_dict = [{}] * tp_size + + for pp in range(pp_size): + for tp in range(tp_size): + if tp_size == 1: + ckpt_file = 
f"{tmpdir}/model_weights.ckpt" + elif pp_size == 1: + ckpt_file = f"{tmpdir}/mp_rank_{tp:02d}/model_weights.ckpt" + else: + ckpt_file = f"{tmpdir}/tp_rank_{tp:02d}_pp_rank_{pp:03d}/model_weights.ckpt" + + l = torch.load(ckpt_file, map_location=torch.device('cpu')) + if pp == 0: + lora_state_dict[tp] = l + else: + # calculate layer offset + layer_offset = lora_config.num_layers // pp_size * pp + for key, value in l.items(): + new_key = replace_number_add_offset(key, layer_offset) + lora_state_dict[tp][new_key] = value + + with open_dict(lora_config): + lora_config.peft.lora_tuning.variant = "canonical" + with open(f"{tmpdir}/model_config.yaml", "w") as f: + OmegaConf.save(lora_config, f) + lora_state_dict = convert_qkv(lora_state_dict, lora_config) + lora_state_dict = convert_hto4h(lora_state_dict, lora_config) + # TODO: currently suport tp=1 + lora_state_dict = lora_state_dict[0] + if hf_format: + lora_state_dict = reformat_module_names_to_hf(lora_state_dict) + torch.save(lora_state_dict, f"{save_path}/model_weights_hf_formatted.pt") + else: + torch.save(lora_state_dict, f"{tmpdir}/model_weights.ckpt") + NLPSaveRestoreConnector._make_nemo_file_from_folder(save_path, tmpdir) + + return lora_state_dict, lora_config + + +def fix_for_O2(state_dict): + new_state_dict = {} + for k, v in state_dict.items(): + if "model.module." not in k: + new_state_dict[k.replace('model.', 'model.module.')] = v + return new_state_dict + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--lora_path", + type=str, + default=None, + required=True, + help="Path to NeMo style (fused) lora checkpoint in .nemo file format", + ) + parser.add_argument( + "--output_path", + type=str, + default=None, + required=True, + help="Path to save the canonical (unfused) lora .nemo file.", + ) + parser.add_argument("--hf_format", action='store_true', help="saves tensors in huggingface naming format.") + parser.add_argument("--precision", type=str, default="16", help="Model precision") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = get_args() + convert_lora(args.lora_path, args.output_path, args.hf_format) From e267406afe4369c67d29989b4fe7bd0c0a9a1f5e Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Wed, 1 May 2024 19:54:47 +0200 Subject: [PATCH 013/178] Restore PTQ tests for Llama2 (reopened) (#9064) * Restore PTQ tests for Llama2 (MR-9018) Signed-off-by: Jan Lasek * try not using release Signed-off-by: eharper * checkout v4 Signed-off-by: eharper --------- Signed-off-by: Jan Lasek Signed-off-by: eharper Co-authored-by: eharper --- .github/workflows/cicd-main.yml | 149 +++++++++++++++++++++++++++++- nemo/export/quantize/quantizer.py | 4 +- nemo/utils/model_utils.py | 20 +++- tests/setup/__main__.py | 4 +- 4 files changed, 166 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index de250596da62..6f090bd34213 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -132,6 +132,9 @@ jobs: apt-get update && apt-get install libsox-fmt-all -y && \ popd + # AMMO installation + pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir + # PyTorch Lightning version python -c "import pytorch_lightning; print(pytorch_lightning.__version__)" @@ -220,7 +223,26 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - + L0_Setup_Test_Data_And_Models: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + container: + image: 
nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python -m tests.setup --save_dir /home/TestData/nlp + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" ## - name: L2: Multimodal Imagen Train @@ -243,10 +265,9 @@ jobs: uses: actions/checkout@v4 - run: | CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --output_path=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \ + --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ --precision=16 - rm -f /home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" @@ -322,6 +343,124 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + L2_PTQ_Llama2_Export_Only: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + quantization.algorithm=null \ + model_save=/home/TestData/nlp/megatron_llama/ci_baseline + + rm -rf /home/TestData/nlp/megatron_llama/ci_baseline + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + + L2_PTQ_Llama2_FP8: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=fp8 \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + export.inference_tensor_parallel=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo + + rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + + L2_PTQ_Llama2_INT8_SQ: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_llama_quantization.py \ + 
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int8_sq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo + + rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + + L2_PTQ_Llama2_INT4_AWQ: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + tensor_model_parallel_size=1 \ + trainer.devices=1 \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int4_awq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo + + rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + # L2: ASR dev run ASR_dev_run_Speech_to_Text: needs: [cicd-test-container-setup] @@ -4664,7 +4803,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | rm -rf /home/TestData/nlp/megatron_ir/working_dir diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 2663f8fe9bac..783f47a08e79 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import copy import tarfile from contextlib import nullcontext from typing import List, Optional @@ -21,7 +20,6 @@ import torch.distributed as dist from megatron.core import parallel_state from megatron.core.transformer.module import Float16Module -from megatron.training.utils import unwrap_model from omegaconf import OmegaConf from omegaconf.omegaconf import DictConfig, open_dict from pytorch_lightning.trainer.trainer import Trainer @@ -31,7 +29,7 @@ from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging from nemo.utils.distributed import temporary_directory -from nemo.utils.model_utils import load_config, save_artifacts +from nemo.utils.model_utils import load_config, save_artifacts, unwrap_model try: import ammo.torch.quantization as atq diff --git a/nemo/utils/model_utils.py b/nemo/utils/model_utils.py index 95d1bc414625..f4eefd39a9ea 100644 --- a/nemo/utils/model_utils.py +++ b/nemo/utils/model_utils.py @@ -24,7 +24,7 @@ from enum import Enum from functools import lru_cache from pathlib import Path -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import wrapt @@ -92,6 +92,24 @@ def load_config(model_file: str) -> DictConfig: return model_config +def unwrap_model(model, module_instances: Union[Type, Tuple[Type]]): + """Unwrap model from wrapper classes like Float16Module, for example.""" + + # TODO: Import this from megatron.core once moved there from megatron.training. + return_list = True + if not isinstance(model, list): + model = [model] + return_list = False + unwrapped_model = [] + for model_module in model: + while isinstance(model_module, module_instances): + model_module = model_module.module + unwrapped_model.append(model_module) + if not return_list: + return unwrapped_model[0] + return unwrapped_model + + def param_is_not_shared(param): return not hasattr(param, 'shared') or not param.shared diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py index 289a2537e2f2..a08ccdaa1634 100644 --- a/tests/setup/__main__.py +++ b/tests/setup/__main__.py @@ -34,8 +34,8 @@ ) create_hf_model( - model_name_or_path="/home/TestData/nlp/meta-llama/Llama-2-7b-hf", - output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf"), + model_name_or_path="/home/TestData/nlp/megatron_llama/llama-ci-hf", + output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf-tiny"), config_updates={"hidden_size": 256, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, overwrite=args.overwrite, ) From 3d87ed7109a456b526f2fe19809623cd5183d5b3 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Wed, 1 May 2024 13:10:59 -0500 Subject: [PATCH 014/178] add clip H config (#9082) * add clip H config * add comment to 1st line of yaml --- .../clip/conf/megatron_clip_VIT-H-14.yaml | 204 ++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-H-14.yaml diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-H-14.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-H-14.yaml new file mode 100644 index 000000000000..b37d64a325e5 --- /dev/null +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-H-14.yaml @@ -0,0 +1,204 @@ +# An example model that works with this config is "https://huggingface.co/yuvalkirstain/PickScore_v1" +model: + precision: 
32 + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 32 # limited by GPU memory + global_batch_size: 32 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + restore_from_pretrained: null # used in fine-tuning + # multimodal configs + output_dim: 1024 + # As the number of devices used to train increases, so does the space complexity of + # the logit matrix. Using a naïve all-gather scheme, space complexity will be + # `O(n^2)`. Instead, complexity may become effectively linear if the flags + # `--gather-with-grad` and `--local-loss` are used. This alteration results in one-to-one + # numerical results as the naïve method. + local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix) + gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue + + vision: + precision: 32 + # vision configs + patch_dim: 14 + img_h: 224 + img_w: 224 + image_mean: null + image_std: null + num_channels: 3 + drop_patch_rate: 0.0 + drop_path_rate: 0.0 + global_average_pool: False + output_dim: ${model.output_dim} + class_token_length: 1 + preprocess_layernorm: True # apply layer norm to embedded tokens + + # model architecture + encoder_seq_length: 196 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: learned_parameters + num_layers: 32 + hidden_size: 1280 + ffn_hidden_size: 5120 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 16 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0. # Dropout probability for hidden state transformer. + attention_dropout: 0. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: layernorm # Type of normalization layers + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + sequence_parallel: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. 
+ + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. + openai_gelu: False + bias_activation_fusion: False + megatron_legacy: True + activation: gelu + + + + text: + precision: 32 + # text configs + output_dim: ${model.output_dim} + + # model architecture + encoder_seq_length: 77 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: learned_parameters + num_layers: 24 + hidden_size: 1024 + ffn_hidden_size: 4096 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 16 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0. # Dropout probability for hidden state transformer. + attention_dropout: 0. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: layernorm # Type of normalization layers + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. + openai_gelu: False + bias_activation_fusion: False + megatron_legacy: True + + transformer_engine: False + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. 
+ activation: gelu + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce + + # miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + tokenizer: + library: 'huggingface' + type: 'openai/clip-vit-large-patch14' + model: null + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + + data: + num_workers: 8 + train: + dataset_path: # List of paths to pkl files or tar files + - /datasets/coyo/test.pkl + validation: # List of paths to pkl files or tar files + dataset_path: + - /datasets/coyo/test.pkl + webdataset: + infinite_sampler: False + local_root_path: /datasets/coyo + + imagenet_val: null # Path to imagenet val set for conducting zero shot evaluation. + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [ 0 ] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 1e-3 + weight_decay: 0.2 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 1e-5 \ No newline at end of file From f658b6f0445403c338c7371941b1fe644832df48 Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Wed, 1 May 2024 12:59:24 -0700 Subject: [PATCH 015/178] Score-based generative enhancement model (#8567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Score-based generative enhancement model in NeMo * Addressed comments, added unit test Signed-off-by: Ante Jukić --- examples/audio_tasks/audio_to_audio_eval.py | 18 +- examples/audio_tasks/conf/beamforming.yaml | 1 - examples/audio_tasks/conf/masking.yaml | 3 - examples/audio_tasks/conf/predictive.yaml | 130 ++ .../conf/score_based_generative.yaml | 149 ++ examples/audio_tasks/speech_enhancement.py | 45 +- nemo/collections/asr/data/audio_to_audio.py | 47 +- .../asr/data/audio_to_audio_dataset.py | 3 + nemo/collections/asr/losses/__init__.py | 2 +- nemo/collections/asr/losses/audio_losses.py | 226 ++- nemo/collections/asr/metrics/audio.py | 7 + nemo/collections/asr/models/__init__.py | 6 +- .../asr/models/audio_to_audio_model.py | 386 ++++- .../asr/models/enhancement_models.py | 695 +++++---- nemo/collections/asr/modules/audio_modules.py | 7 +- .../asr/modules/audio_preprocessing.py | 59 +- .../asr/parts/submodules/diffusion.py | 1310 +++++++++++++++++ requirements/requirements_asr.txt | 1 + tests/collections/asr/test_asr_datasets.py | 33 + tests/collections/asr/test_asr_losses.py | 192 ++- .../asr/test_audio_preprocessing.py | 14 +- 21 files changed, 2985 insertions(+), 349 deletions(-) create mode 100644 examples/audio_tasks/conf/predictive.yaml create mode 100644 
examples/audio_tasks/conf/score_based_generative.yaml create mode 100644 nemo/collections/asr/parts/submodules/diffusion.py diff --git a/examples/audio_tasks/audio_to_audio_eval.py b/examples/audio_tasks/audio_to_audio_eval.py index 4ac68dfc84e7..ab6623df298d 100644 --- a/examples/audio_tasks/audio_to_audio_eval.py +++ b/examples/audio_tasks/audio_to_audio_eval.py @@ -61,6 +61,7 @@ import json import os import tempfile +from collections import defaultdict from dataclasses import dataclass, field, is_dataclass from typing import List, Optional @@ -101,6 +102,9 @@ class AudioEvaluationConfig(process_audio.ProcessConfig): # Metrics to calculate metrics: List[str] = field(default_factory=lambda: ['sdr', 'estoi']) + # Return metric values for each example + return_values_per_example: bool = False + def get_evaluation_dataloader(config): """Prepare a dataloader for evaluation. @@ -174,6 +178,9 @@ def main(cfg: AudioEvaluationConfig): # Setup metrics metrics = get_metrics(cfg) + if cfg.return_values_per_example and cfg.batch_size > 1: + raise ValueError('return_example_values is only supported for batch_size=1.') + # Processing if not cfg.only_score_manifest: # Process audio using the configured model and save in the output directory @@ -236,6 +243,10 @@ def main(cfg: AudioEvaluationConfig): num_files += 1 + if cfg.max_utts is not None and num_files >= cfg.max_utts: + logging.info('Reached max_utts: %s', cfg.max_utts) + break + # Prepare dataloader config = { 'manifest_filepath': temporary_manifest_filepath, @@ -249,6 +260,8 @@ def main(cfg: AudioEvaluationConfig): } temporary_dataloader = get_evaluation_dataloader(config) + metrics_value_per_example = defaultdict(list) + # Calculate metrics for eval_batch in tqdm(temporary_dataloader, desc='Evaluating'): processed_signal, processed_length, target_signal, target_length = eval_batch @@ -257,7 +270,9 @@ def main(cfg: AudioEvaluationConfig): raise RuntimeError(f'Length mismatch.') for name, metric in metrics.items(): - metric.update(preds=processed_signal, target=target_signal, input_length=target_length) + value = metric(preds=processed_signal, target=target_signal, input_length=target_length) + if cfg.return_values_per_example: + metrics_value_per_example[name].append(value.item()) # Convert to a dictionary with name: value metrics_value = {name: metric.compute().item() for name, metric in metrics.items()} @@ -277,6 +292,7 @@ def main(cfg: AudioEvaluationConfig): # Inject the metric name and score into the config, and return the entire config with open_dict(cfg): cfg.metrics_value = metrics_value + cfg.metrics_value_per_example = dict(metrics_value_per_example) return cfg diff --git a/examples/audio_tasks/conf/beamforming.yaml b/examples/audio_tasks/conf/beamforming.yaml index 18e04f0bd12a..3abc4f134e64 100644 --- a/examples/audio_tasks/conf/beamforming.yaml +++ b/examples/audio_tasks/conf/beamforming.yaml @@ -44,7 +44,6 @@ model: _target_: nemo.collections.asr.modules.audio_preprocessing.AudioToSpectrogram fft_length: 512 # Length of the window and FFT for calculating spectrogram hop_length: 256 # Hop length for calculating spectrogram - power: null decoder: _target_: nemo.collections.asr.modules.audio_preprocessing.SpectrogramToAudio diff --git a/examples/audio_tasks/conf/masking.yaml b/examples/audio_tasks/conf/masking.yaml index c667bec53076..68adca116aa5 100644 --- a/examples/audio_tasks/conf/masking.yaml +++ b/examples/audio_tasks/conf/masking.yaml @@ -1,5 +1,3 @@ -# This configuration contains the exemplary values for training a 
multichannel speech enhancement model with a mask-based beamformer. -# name: "masking" model: @@ -44,7 +42,6 @@ model: _target_: nemo.collections.asr.modules.audio_preprocessing.AudioToSpectrogram fft_length: 512 # Length of the window and FFT for calculating spectrogram hop_length: 256 # Hop length for calculating spectrogram - power: null decoder: _target_: nemo.collections.asr.modules.audio_preprocessing.SpectrogramToAudio diff --git a/examples/audio_tasks/conf/predictive.yaml b/examples/audio_tasks/conf/predictive.yaml new file mode 100644 index 000000000000..b141ba6fd1ee --- /dev/null +++ b/examples/audio_tasks/conf/predictive.yaml @@ -0,0 +1,130 @@ +name: "predictive_model" + +model: + type: predictive + sample_rate: 16000 + skip_nan_grad: false + num_outputs: 1 + normalize_input: true # normalize the input signal to 0dBFS + + train_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + audio_duration: 2.04 # Number of STFT time frames = 1 + audio_duration // encoder.hop_length = 256 + random_offset: true + normalization_signal: input_signal + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + batch_size: 8 + shuffle: false + num_workers: 4 + pin_memory: true + + encoder: + _target_: nemo.collections.asr.modules.audio_preprocessing.AudioToSpectrogram + fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.asr.modules.audio_preprocessing.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.asr.parts.submodules.diffusion.SpectrogramNoiseConditionalScoreNetworkPlusPlus + in_channels: 1 # single-channel noisy input + out_channels: 1 # single-channel estimate + num_res_blocks: 3 # increased number of res blocks + pad_time_to: 64 # pad to 64 frames for the time dimension + pad_dimension_to: 0 # no padding in the frequency dimension + + loss: + _target_: nemo.collections.asr.losses.MSELoss # computed in the time domain + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + + optim: + name: adam + lr: 1e-4 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: null + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. 
+ enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: False # use original weights for validation calculation? + + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_sisdr + mode: max + save_top_k: 5 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/audio_tasks/conf/score_based_generative.yaml b/examples/audio_tasks/conf/score_based_generative.yaml new file mode 100644 index 000000000000..c0b36bd750a2 --- /dev/null +++ b/examples/audio_tasks/conf/score_based_generative.yaml @@ -0,0 +1,149 @@ +name: score_based_generative_model + +model: + type: score_based + sample_rate: 16000 + skip_nan_grad: false + num_outputs: 1 + normalize_input: true + max_utts_evaluation_metrics: 50 # metric calculation needs full inference and is slow, so we limit to first few files + + train_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + audio_duration: 2.04 # Number of STFT time frames = 1 + audio_duration // encoder.hop_length = 256 + random_offset: true + normalization_signal: input_signal + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? 
+ input_key: noisy_filepath + target_key: clean_filepath + normalize_input: false # load data as is for validation, the model will normalize it for inference + batch_size: 4 + shuffle: false + num_workers: 4 + pin_memory: true + + encoder: + _target_: nemo.collections.asr.modules.audio_preprocessing.AudioToSpectrogram + fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.asr.modules.audio_preprocessing.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.asr.parts.submodules.diffusion.SpectrogramNoiseConditionalScoreNetworkPlusPlus + in_channels: 2 # concatenation of single-channel perturbed and noisy + out_channels: 1 # single-channel score estimate + conditioned_on_time: true + num_res_blocks: 3 # increased number of res blocks + pad_time_to: 64 # pad to 64 frames for the time dimension + pad_dimension_to: 0 # no padding in the frequency dimension + + sde: + _target_: nemo.collections.asr.parts.submodules.diffusion.OrnsteinUhlenbeckVarianceExplodingSDE + stiffness: 1.5 + std_min: 0.05 + std_max: 0.5 + num_steps: 1000 + + sampler: + _target_: nemo.collections.asr.parts.submodules.diffusion.PredictorCorrectorSampler + predictor: reverse_diffusion + corrector: annealed_langevin_dynamics + num_steps: 50 + num_corrector_steps: 1 + snr: 0.5 + + loss: + _target_: nemo.collections.asr.losses.MSELoss + ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time) + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + + optim: + name: adam + lr: 1e-4 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: null + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. + enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: false # use original weights for validation calculation? 
+ + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_sisdr + mode: max + save_top_k: 5 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/audio_tasks/speech_enhancement.py b/examples/audio_tasks/speech_enhancement.py index 250d212d2a25..33a25c1c107c 100644 --- a/examples/audio_tasks/speech_enhancement.py +++ b/examples/audio_tasks/speech_enhancement.py @@ -26,25 +26,64 @@ PyTorch Lightning Trainer arguments and args of the model and the optimizer can be added or overriden from CLI """ +from enum import Enum + import pytorch_lightning as pl import torch from omegaconf import OmegaConf -from nemo.collections.asr.models import EncMaskDecAudioToAudioModel +from nemo.collections.asr.models.enhancement_models import ( + EncMaskDecAudioToAudioModel, + PredictiveAudioToAudioModel, + ScoreBasedGenerativeAudioToAudioModel, +) from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +class ModelType(str, Enum): + """Enumeration with the available model types. + """ + + MaskBased = 'mask_based' + Predictive = 'predictive' + ScoreBased = 'score_based' + + +def get_model_class(model_type: ModelType): + """Get model class for a given model type. + """ + if model_type == ModelType.MaskBased: + return EncMaskDecAudioToAudioModel + elif model_type == ModelType.Predictive: + return PredictiveAudioToAudioModel + elif model_type == ModelType.ScoreBased: + return ScoreBasedGenerativeAudioToAudioModel + else: + raise ValueError(f'Unknown model type: {model_type}') + + @hydra_runner(config_path="./conf", config_name="masking") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg, resolve=True)}') trainer = pl.Trainer(**cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) - model = EncMaskDecAudioToAudioModel(cfg=cfg.model, trainer=trainer) - # Initialize the weights of the model from another model, if provided via config + # Get model class + model_type = cfg.model.get('type') + if model_type is None: + model_type = ModelType.MaskBased + logging.warning('model_type not found in config. 
Using default: %s', model_type) + + logging.info('Get class for model type: %s', model_type) + model_class = get_model_class(model_type) + + logging.info('Instantiate model %s', model_class.__name__) + model = model_class(cfg=cfg.model, trainer=trainer) + + logging.info('Initialize the weights of the model from another model, if provided via config') model.maybe_init_from_pretrained_checkpoint(cfg) # Train the model diff --git a/nemo/collections/asr/data/audio_to_audio.py b/nemo/collections/asr/data/audio_to_audio.py index a3c6dd0cc1b3..4f4727239a4b 100644 --- a/nemo/collections/asr/data/audio_to_audio.py +++ b/nemo/collections/asr/data/audio_to_audio.py @@ -130,13 +130,19 @@ class ASRAudioProcessor: sample_rate: sample rate used for all audio signals random_offset: If `True`, offset will be randomized when loading a subsegment from a file. + normalization_signal: Normalize all audio with a factor that ensures the signal + `example[normalization_signal]` in `process` is in range [-1, 1]. + All other audio signals are scaled by the same factor. Default is + `None`, corresponding to no normalization. """ def __init__( - self, sample_rate: float, random_offset: bool, + self, sample_rate: float, random_offset: bool, normalization_signal: Optional[str] = None, eps: float = 1e-8, ): self.sample_rate = sample_rate self.random_offset = random_offset + self.normalization_signal = normalization_signal + self.eps = eps self.sync_setup = None self.async_setup = None @@ -314,7 +320,20 @@ def process_audio(self, audio: Dict[str, torch.Tensor]) -> Dict[str, torch.Tenso Returns: An ordered dictionary of signals and their tensors. """ - # Currently, not doing any processing of the loaded signals. + if self.normalization_signal: + # Normalize all audio with a factor that ensures the normalization signal is in range [-1, 1]. + norm_scale = audio[self.normalization_signal].abs().max() + + # Do not normalize embeddings + skip_signals = self.embedding_setup.signals if self.embedding_setup is not None else [] + + # Normalize audio signals + for signal in audio: + if signal not in skip_signals: + # All audio signals are scaled by the same factor. + # This ensures that the relative level between signals is preserved. + audio[signal] = audio[signal] / (norm_scale + self.eps) + return audio def load_sync_signals(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]: @@ -812,6 +831,9 @@ class AudioToTargetDataset(BaseAudioDataset): If `None`, all channels will be loaded. target_channel_selector: Optional, select subset of channels from each input audio file. If `None`, all channels will be loaded. + normalization_signal: Normalize audio signals with a scale that ensures the normalization signal is in range [-1, 1]. + All audio signals are scaled by the same factor. Supported values are `None` (no normalization), + 'input_signal', 'target_signal'. 
""" def __init__( @@ -827,6 +849,7 @@ def __init__( max_utts: Optional[int] = None, input_channel_selector: Optional[int] = None, target_channel_selector: Optional[int] = None, + normalization_signal: Optional[str] = None, ): audio_to_manifest_key = { 'input_signal': input_key, @@ -841,7 +864,9 @@ def __init__( max_number=max_utts, ) - audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,) + audio_processor = ASRAudioProcessor( + sample_rate=sample_rate, random_offset=random_offset, normalization_signal=normalization_signal, + ) audio_processor.sync_setup = SignalSetup( signals=['input_signal', 'target_signal'], duration=audio_duration, @@ -932,6 +957,9 @@ class AudioToTargetWithReferenceDataset(BaseAudioDataset): from input and target. reference_duration: Optional, can be used to set a fixed duration of the reference utterance. If `None`, complete audio file will be loaded. + normalization_signal: Normalize audio signals with a scale that ensures the normalization signal is in range [-1, 1]. + All audio signals are scaled by the same factor. Supported values are `None` (no normalization), + 'input_signal', 'target_signal', 'reference_signal'. """ def __init__( @@ -951,6 +979,7 @@ def __init__( reference_channel_selector: Optional[int] = None, reference_is_synchronized: bool = True, reference_duration: Optional[float] = None, + normalization_signal: Optional[str] = None, ): audio_to_manifest_key = { 'input_signal': input_key, @@ -966,7 +995,9 @@ def __init__( max_number=max_utts, ) - audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,) + audio_processor = ASRAudioProcessor( + sample_rate=sample_rate, random_offset=random_offset, normalization_signal=normalization_signal, + ) if reference_is_synchronized: audio_processor.sync_setup = SignalSetup( @@ -1063,6 +1094,9 @@ class AudioToTargetWithEmbeddingDataset(BaseAudioDataset): If `None`, all channels will be loaded. target_channel_selector: Optional, select subset of channels from each input audio file. If `None`, all channels will be loaded. + normalization_signal: Normalize audio signals with a scale that ensures the normalization signal is in range [-1, 1]. + All audio signals are scaled by the same factor. Supported values are `None` (no normalization), + 'input_signal', 'target_signal'. 
""" def __init__( @@ -1079,6 +1113,7 @@ def __init__( max_utts: Optional[int] = None, input_channel_selector: Optional[int] = None, target_channel_selector: Optional[int] = None, + normalization_signal: Optional[str] = None, ): audio_to_manifest_key = { 'input_signal': input_key, @@ -1094,7 +1129,9 @@ def __init__( max_number=max_utts, ) - audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,) + audio_processor = ASRAudioProcessor( + sample_rate=sample_rate, random_offset=random_offset, normalization_signal=normalization_signal, + ) audio_processor.sync_setup = SignalSetup( signals=['input_signal', 'target_signal'], duration=audio_duration, diff --git a/nemo/collections/asr/data/audio_to_audio_dataset.py b/nemo/collections/asr/data/audio_to_audio_dataset.py index b296d64b1f2a..46e47020fda0 100644 --- a/nemo/collections/asr/data/audio_to_audio_dataset.py +++ b/nemo/collections/asr/data/audio_to_audio_dataset.py @@ -36,6 +36,7 @@ def get_audio_to_target_dataset(config: dict) -> audio_to_audio.AudioToTargetDat max_utts=config.get('max_utts', 0), input_channel_selector=config.get('input_channel_selector', None), target_channel_selector=config.get('target_channel_selector', None), + normalization_signal=config.get('normalization_signal', None), ) return dataset @@ -65,6 +66,7 @@ def get_audio_to_target_with_reference_dataset(config: dict) -> audio_to_audio.A reference_channel_selector=config.get('reference_channel_selector', None), reference_is_synchronized=config.get('reference_is_synchronized', True), reference_duration=config.get('reference_duration', None), + normalization_signal=config.get('normalization_signal', None), ) return dataset @@ -91,5 +93,6 @@ def get_audio_to_target_with_embedding_dataset(config: dict) -> audio_to_audio.A max_utts=config.get('max_utts', 0), input_channel_selector=config.get('input_channel_selector', None), target_channel_selector=config.get('target_channel_selector', None), + normalization_signal=config.get('normalization_signal', None), ) return dataset diff --git a/nemo/collections/asr/losses/__init__.py b/nemo/collections/asr/losses/__init__.py index 3e50cea1d692..c03f7a48ffe3 100644 --- a/nemo/collections/asr/losses/__init__.py +++ b/nemo/collections/asr/losses/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. from nemo.collections.asr.losses.angularloss import AngularSoftmaxLoss -from nemo.collections.asr.losses.audio_losses import SDRLoss +from nemo.collections.asr.losses.audio_losses import MSELoss, SDRLoss from nemo.collections.asr.losses.ctc import CTCLoss from nemo.collections.asr.losses.lattice_losses import LatticeLoss from nemo.collections.asr.losses.ssl_losses.contrastive import ContrastiveLoss diff --git a/nemo/collections/asr/losses/audio_losses.py b/nemo/collections/asr/losses/audio_losses.py index 62ce4a9f7edd..b0214375a713 100644 --- a/nemo/collections/asr/losses/audio_losses.py +++ b/nemo/collections/asr/losses/audio_losses.py @@ -13,7 +13,7 @@ # limitations under the License. 
import math -from typing import List, Optional +from typing import List, Optional, Tuple, Union import numpy as np import torch @@ -21,31 +21,33 @@ from nemo.collections.asr.parts.preprocessing.features import make_seq_mask_like from nemo.collections.asr.parts.utils.audio_utils import toeplitz from nemo.core.classes import Loss, Typing, typecheck -from nemo.core.neural_types import AudioSignal, LengthsType, LossType, MaskType, NeuralType +from nemo.core.neural_types import AudioSignal, LengthsType, LossType, MaskType, NeuralType, VoidType from nemo.utils import logging -__all__ = ['SDRLoss'] +__all__ = ['SDRLoss', 'MSELoss'] -def temporal_mean( +def calculate_mean( input: torch.Tensor, input_length: Optional[torch.Tensor] = None, mask: Optional[torch.Tensor] = None, + dim: Union[int, Tuple[int]] = -1, keepdim: bool = False, eps: float = 1e-10, ) -> torch.Tensor: - """Calculate mean along temporal dimension with optionally + """Calculate mean along dimension `dim` with optionally averaging only over valid samples (based on the input length). Args: - input: Batch of signals, shape (B, C, T) + input: signal, for example (B, C, T) or (B, C, D, T) input_length: Optional, length of each example in the batch, shape (B,) - mask: Optional, temporal mask for each example in the batch, shape (B, T) + mask: Optional, temporal mask for each example in the batch, same shape as the input signal + dim: dimension or dimensions to reduce keepdim: Whether to keep the temporal dimension eps: Regularization to avoid division by zero Returns: - (B, C, 1) if keepdim=True, otherwise (B, C) + Mean over dimensions `dim`. """ if input_length is not None: if mask is not None: @@ -53,17 +55,18 @@ def temporal_mean( 'Argument `input_length` is mutually exclusive with `mask`. Both cannot be used at the same time.' 
) # Construct a binary mask - mask = make_seq_mask_like(lengths=input_length, like=input, time_dim=-1, valid_ones=True).squeeze(1) + mask = make_seq_mask_like(lengths=input_length, like=input, time_dim=-1, valid_ones=True) + mask = mask.expand_as(input) if mask is None: # No length information, assume all samples are valid - mean = torch.mean(input, dim=-1, keepdim=keepdim) + mean = torch.mean(input, dim=dim, keepdim=keepdim) else: # Average using temporal mask - mean = mask.unsqueeze(1) * input - mean = torch.sum(mean, axis=-1, keepdim=keepdim) - normalization = torch.sum(mask, axis=-1, keepdim=keepdim) - mean = mean / (normalization.unsqueeze(1) + eps) + mean = mask * input + mean = torch.sum(mean, dim=dim, keepdim=keepdim) + normalization = torch.sum(mask, dim=dim, keepdim=keepdim) + mean = mean / (normalization + eps) return mean @@ -101,16 +104,17 @@ def scale_invariant_target( ) # Construct a binary mask - mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True).squeeze(1) + mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True) + mask = mask.expand_as(estimate) - estimate_dot_target = temporal_mean(estimate * target, mask=mask, keepdim=True, eps=eps) - target_pow = temporal_mean(torch.abs(target) ** 2, mask=mask, keepdim=True, eps=eps) + estimate_dot_target = calculate_mean(estimate * target, mask=mask, dim=-1, keepdim=True, eps=eps) + target_pow = calculate_mean(torch.abs(target) ** 2, mask=mask, dim=-1, keepdim=True, eps=eps) scale = estimate_dot_target / (target_pow + eps) target_scaled = scale * target # Mask to keep only the valid samples if mask is not None: - target_scaled = mask.unsqueeze(1) * target_scaled + target_scaled = mask * target_scaled return target_scaled @@ -162,12 +166,13 @@ def convolution_invariant_target( ) # Construct a binary mask - mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True).squeeze(1) + mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True) + mask = mask.expand_as(estimate) # Apply a mask, if available if mask is not None: - estimate = mask.unsqueeze(1) * estimate - target = mask.unsqueeze(1) * target + estimate = mask * estimate + target = mask * target # Calculate filtered target input_shape = estimate.shape @@ -207,7 +212,7 @@ def convolution_invariant_target( # Mask to keep only the valid samples if mask is not None: - target_filt = mask.unsqueeze(1) * target_filt + target_filt = mask * target_filt return target_filt @@ -261,11 +266,12 @@ def calculate_sdr_batch( ) # Construct a binary mask - mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True).squeeze(1) + mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True) + mask = mask.expand_as(estimate) if remove_mean: - estimate = estimate - temporal_mean(estimate, mask=mask, keepdim=True, eps=eps) - target = target - temporal_mean(target, mask=mask, keepdim=True, eps=eps) + estimate = estimate - calculate_mean(estimate, mask=mask, dim=-1, keepdim=True, eps=eps) + target = target - calculate_mean(target, mask=mask, dim=-1, keepdim=True, eps=eps) if scale_invariant or (convolution_invariant and convolution_filter_length == 1): target = scale_invariant_target(estimate=estimate, target=target, mask=mask, eps=eps) @@ -276,8 +282,8 @@ def calculate_sdr_batch( distortion = estimate - target - target_pow = temporal_mean(torch.abs(target) ** 2, mask=mask, eps=eps) - distortion_pow = 
temporal_mean(torch.abs(distortion) ** 2, mask=mask, eps=eps) + target_pow = calculate_mean(torch.abs(target) ** 2, mask=mask, dim=-1, eps=eps) + distortion_pow = calculate_mean(torch.abs(distortion) ** 2, mask=mask, dim=-1, eps=eps) if sdr_max is not None: distortion_pow = distortion_pow + 10 ** (-sdr_max / 10) * target_pow @@ -353,7 +359,7 @@ def input_types(self): "estimate": NeuralType(signal_shape, AudioSignal()), "target": NeuralType(signal_shape, AudioSignal()), "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "mask": NeuralType(('B', 'T'), MaskType(), optional=True), + "mask": NeuralType(('B', 'C', 'T'), MaskType(), optional=True), } @property @@ -376,10 +382,10 @@ def forward( perform averaging across channels (weighting optional), and apply reduction across the batch. Args: - estimate: Batch of signals, shape (B, T, C) - target: Batch of signals, shape (B, T, C) + estimate: Batch of signals, shape (B, C, T) + target: Batch of signals, shape (B, C, T) input_length: Batch of lengths, shape (B,) - mask: Batch of temporal masks, shape (B, T) + mask: Batch of temporal masks for each channel, shape (B, C, T) Returns: Scalar loss. @@ -410,3 +416,161 @@ def forward( sdr = self.reduce(sdr) return -sdr + + +def calculate_mse_batch( + estimate: torch.Tensor, + target: torch.Tensor, + input_length: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """Calculate MSE per channel. + + MSE = ||estimate - target||_2^2 / input_length + + Args: + estimate: estimated signal, shape (B, C, T) or (B, C, D, T) + target: target signal, shape (B, C, T) or (B, C, D, T) + input_length: Optional, length of valid samples, shape (B,) + mask: Optional, temporal mask, same shape as signals + + Returns: + MSE for each channel, shape (B, C) + """ + assert ( + estimate.shape == target.shape + ), f'Estimate shape ({estimate.shape}) not matching target shape ({target.shape})' + + if input_length is not None: + if mask is not None: + raise RuntimeError( + 'Argument `input_length` is mutually exclusive with `mask`. Both cannot be used at the same time.' + ) + + # Construct a binary mask + mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True) + mask = mask.expand_as(estimate) + + # error + err = estimate - target + + # dimensions for averaging + if estimate.ndim == 3: + # average across time + dim = -1 + elif estimate.ndim == 4: + # average across time and features + dim = (-2, -1) + else: + raise RuntimeError(f'Unexpected dimension of the input: {estimate.shape}') + + # calculate masked mean + mse = calculate_mean(torch.abs(err) ** 2, mask=mask, dim=dim) + + return mse + + +class MSELoss(Loss, Typing): + """ + Computes MSE loss with weighted average across channels. + + Args: + weight: weight for loss of each output channel, used for averaging the loss across channels. Defaults to `None` (averaging). + reduction: batch reduction. Defaults to `mean` over the batch. + ndim: Number of dimensions for the input signal + """ + + def __init__( + self, weight: Optional[List[float]] = None, reduction: str = 'mean', ndim: int = 3, + ): + super().__init__() + + # weight buffer + if weight is not None: + if any([w <= 0 for w in weight]): + raise ValueError(f'Weight must be positive! 
Current value: {weight}') + elif not np.isclose(sum(weight), 1, atol=1e-6): + raise ValueError(f'Weight should add to one, current weight: {weight}') + weight = torch.tensor(weight).reshape(1, -1) + logging.info(f'Channel weight set to %s', weight) + self.register_buffer('weight', weight) + self.weight: Optional[Tensor] + + # Batch reduction + self.reduction = reduction + if reduction == 'mean': + self.reduce = torch.mean + else: + raise ValueError(f'Unexpected reduction mode {reduction}.') + + # Input dimension + self.ndim = ndim + + if self.ndim == 3: + # Time-domain input + self.signal_shape = ('B', 'C', 'T') + elif self.ndim == 4: + # Spectral-domain input + self.signal_shape = ('B', 'C', 'D', 'T') + else: + raise ValueError(f'Unexpected input dimension: {self.ndim}') + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tweight: %s', self.weight) + logging.debug('\treduction: %s', self.reduction) + logging.debug('\tndim: %s', self.ndim) + logging.debug('\tsignal_shape: %s', self.signal_shape) + + @property + def input_types(self): + """Input types definitions for SDRLoss. + """ + return { + "estimate": NeuralType(self.signal_shape, VoidType()), + "target": NeuralType(self.signal_shape, VoidType()), + "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "mask": NeuralType(self.signal_shape, MaskType(), optional=True), + } + + @property + def output_types(self): + """Output types definitions for SDRLoss. + loss: + NeuralType(None) + """ + return {"loss": NeuralType(elements_type=LossType())} + + @typecheck() + def forward( + self, + estimate: torch.Tensor, + target: torch.Tensor, + input_length: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """For input batch of multi-channel signals, calculate SDR between estimate and target for each channel, + perform averaging across channels (weighting optional), and apply reduction across the batch. + + Args: + estimate: Estimate of the target signal + target: Target signal + input_length: Length of each example in the batch + mask: Mask for each signal + + Returns: + Scalar loss. 
+ """ + mse = calculate_mse_batch(estimate=estimate, target=target, input_length=input_length, mask=mask,) + + # channel averaging + if self.weight is None: + mse = torch.mean(mse, dim=1) + else: + # weighting across channels + mse = mse * self.weight + mse = torch.sum(mse, dim=1) + + # reduction + mse = self.reduce(mse) + + return mse diff --git a/nemo/collections/asr/metrics/audio.py b/nemo/collections/asr/metrics/audio.py index 5e8c2915e3fa..db63ac19c098 100644 --- a/nemo/collections/asr/metrics/audio.py +++ b/nemo/collections/asr/metrics/audio.py @@ -57,6 +57,7 @@ class AudioMetricWrapper(Metric): """ full_state_update: bool = False + num_examples: torch.Tensor def __init__( self, metric: Metric, channel: Optional[int] = None, metric_using_batch_averaging: Optional[bool] = None @@ -74,6 +75,7 @@ def __init__( self._metric = metric self._channel = channel + self.add_state('num_examples', default=torch.tensor(0), dist_reduce_fx='sum') logging.debug('Setup metric %s, channel %s', metric, str(channel)) def _select_channel(self, preds: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: @@ -144,6 +146,8 @@ def update(self, preds: torch.Tensor, target: torch.Tensor, input_length: Option for b_preds, b_target in self._trim_inputs(preds=preds, target=target, input_length=input_length): self._metric.update(preds=b_preds, target=b_target) + self.num_examples += preds.size(0) + def compute(self) -> torch.Tensor: """Compute the underlying metric. """ @@ -179,6 +183,9 @@ def forward( def reset(self) -> None: """Reset the underlying metric. """ + # reset the internal states + super().reset() + # reset the underlying metric self._metric.reset() def __repr__(self) -> str: diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py index 019c57f9c4e3..23c759afc80d 100644 --- a/nemo/collections/asr/models/__init__.py +++ b/nemo/collections/asr/models/__init__.py @@ -23,7 +23,11 @@ from nemo.collections.asr.models.clustering_diarizer import ClusteringDiarizer from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.models.ctc_models import EncDecCTCModel -from nemo.collections.asr.models.enhancement_models import EncMaskDecAudioToAudioModel +from nemo.collections.asr.models.enhancement_models import ( + EncMaskDecAudioToAudioModel, + PredictiveAudioToAudioModel, + ScoreBasedGenerativeAudioToAudioModel, +) from nemo.collections.asr.models.hybrid_rnnt_ctc_bpe_models import EncDecHybridRNNTCTCBPEModel from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel from nemo.collections.asr.models.k2_sequence_models import ( diff --git a/nemo/collections/asr/models/audio_to_audio_model.py b/nemo/collections/asr/models/audio_to_audio_model.py index 49364843e8b8..094dbc38b72a 100644 --- a/nemo/collections/asr/models/audio_to_audio_model.py +++ b/nemo/collections/asr/models/audio_to_audio_model.py @@ -12,15 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json +import os +import tempfile from abc import ABC, abstractmethod -from typing import List, Union +from typing import Dict, List, Optional, Union import hydra +import librosa +import soundfile as sf import torch from omegaconf import DictConfig, OmegaConf from pytorch_lightning import Trainer +from tqdm import tqdm +from nemo.collections.asr.data import audio_to_audio_dataset +from nemo.collections.asr.data.audio_to_audio_lhotse import LhotseAudioToTargetDataset +from nemo.collections.asr.data.audio_to_text_dataset import inject_dataloader_value_from_model_config from nemo.collections.asr.metrics.audio import AudioMetricWrapper +from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes import ModelPT from nemo.utils import logging, model_utils @@ -158,23 +169,384 @@ def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_evaluation_epoch_end(outputs, dataloader_idx, 'test') - @abstractmethod + @torch.no_grad() def process( - self, paths2audio_files: List[str], output_dir: str, batch_size: int = 4 - ) -> List[Union[str, List[str]]]: + self, + paths2audio_files: List[str], + output_dir: str, + batch_size: int = 1, + num_workers: Optional[int] = None, + input_channel_selector: Optional[ChannelSelectorType] = None, + ) -> List[str]: + """ + Process audio files provided in paths2audio_files. + Processed signals will be saved in output_dir. + + Args: + paths2audio_files: (a list) of paths to audio files. \ + Recommended length per file is between 5 and 25 seconds. \ + But it is possible to pass a few hours long file if enough GPU memory is available. + output_dir: + batch_size: (int) batch size to use during inference. + Bigger will result in better throughput performance but would use more memory. + num_workers: Number of workers for the dataloader + input_channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. 
+ + Returns: + """ + if paths2audio_files is None or len(paths2audio_files) == 0: + return {} + + if num_workers is None: + num_workers = min(batch_size, os.cpu_count() - 1) + + # Output + paths2processed_files = [] + + # Model's mode and device + mode = self.training + device = next(self.parameters()).device + + try: + # Switch model to evaluation mode + self.eval() + # Freeze weights + self.freeze() + + logging_level = logging.get_verbosity() + logging.set_verbosity(logging.WARNING) + + # Processing + with tempfile.TemporaryDirectory() as tmpdir: + # Save temporary manifest + temporary_manifest_filepath = os.path.join(tmpdir, 'manifest.json') + with open(temporary_manifest_filepath, 'w', encoding='utf-8') as fp: + for audio_file in paths2audio_files: + entry = {'input_filepath': audio_file, 'duration': librosa.get_duration(path=audio_file)} + fp.write(json.dumps(entry) + '\n') + + config = { + 'manifest_filepath': temporary_manifest_filepath, + 'input_key': 'input_filepath', + 'input_channel_selector': input_channel_selector, + 'batch_size': min(batch_size, len(paths2audio_files)), + 'num_workers': num_workers, + } + + # Create output dir if necessary + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + # DataLoader for the input files + temporary_dataloader = self._setup_process_dataloader(config) + + # Indexing of the original files, used to form the output file name + file_idx = 0 + + # Process batches + for test_batch in tqdm(temporary_dataloader, desc="Processing"): + input_signal = test_batch[0] + input_length = test_batch[1] + + # Expand channel dimension, if necessary + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = input_signal.unsqueeze(1) + + processed_batch, _ = self.forward( + input_signal=input_signal.to(device), input_length=input_length.to(device) + ) + + for example_idx in range(processed_batch.size(0)): + # This assumes the data loader is not shuffling files + file_name = os.path.basename(paths2audio_files[file_idx]) + # Prepare output file + output_file = os.path.join(output_dir, f'processed_{file_name}') + # Crop the output signal to the actual length + output_signal = processed_batch[example_idx, :, : input_length[example_idx]].cpu().numpy() + # Write audio + sf.write(output_file, output_signal.T, self.sample_rate, 'float') + # Update the file counter + file_idx += 1 + # Save processed file + paths2processed_files.append(output_file) + + del test_batch + del processed_batch + + finally: + # set mode back to its original value + self.train(mode=mode) + if mode is True: + self.unfreeze() + logging.set_verbosity(logging_level) + + return paths2processed_files + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + + if config.get("use_lhotse", False): + return get_lhotse_dataloader_from_config( + config, global_rank=self.global_rank, world_size=self.world_size, dataset=LhotseAudioToTargetDataset() + ) + + is_concat = config.get('is_concat', False) + if is_concat: + raise NotImplementedError('Concat not implemented') + + # TODO: Consider moving `inject` from `audio_to_text_dataset` to a utility module? 
+ # Automatically inject args from model config to dataloader config + inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') + + # Instantiate tarred dataset loader or normal dataset loader + if config.get('is_tarred', False): + raise NotImplementedError('Tarred datasets not supported') + + if 'manifest_filepath' in config and config['manifest_filepath'] is None: + logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") + return None + + dataset = audio_to_audio_dataset.get_audio_to_target_dataset(config=config) + + if hasattr(dataset, 'collate_fn'): + collate_fn = dataset.collate_fn + elif hasattr(dataset.datasets[0], 'collate_fn'): + # support datasets that are lists of entries + collate_fn = dataset.datasets[0].collate_fn + else: + # support datasets that are lists of lists + collate_fn = dataset.datasets[0].datasets[0].collate_fn + + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=config['shuffle'], + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + + def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the training data loader via a Dict-like object. + + Args: + train_data_config: A config that contains the information regarding construction + of a training dataset. + + Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` + """ + if 'shuffle' not in train_data_config: + train_data_config['shuffle'] = True + + # preserve config + self._update_dataset_config(dataset_name='train', config=train_data_config) + + self._train_dl = self._setup_dataloader_from_config(config=train_data_config) + + if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + raise NotImplementedError('Tarred datasets not supported') + + def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the validation data loader via a Dict-like object. + + Args: + val_data_config: A config that contains the information regarding construction + of a validation dataset. + + Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` + """ + if 'shuffle' not in val_data_config: + val_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='validation', config=val_data_config) + + self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) + + def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the test data loader via a Dict-like object. + + Args: + test_data_config: A config that contains the information regarding construction + of a test dataset. + + Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` + """ + if 'shuffle' not in test_data_config: + test_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='test', config=test_data_config) + + self._test_dl = self._setup_dataloader_from_config(config=test_data_config) + + def _setup_process_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': + """Prepare a dataloader for processing files. 
+ + Args: + config: A python dictionary which contains the following keys: + manifest_filepath: path to a manifest file + input_key: key with audio filepaths in the manifest + input_channel_selector: Optional, used to select a subset of channels from input audio files + batch_size: batch size for the dataloader + num_workers: number of workers for the dataloader + + Returns: + A pytorch DataLoader for the given manifest filepath. + """ + dl_config = { + 'manifest_filepath': config['manifest_filepath'], + 'sample_rate': self.sample_rate, + 'input_key': config['input_key'], + 'input_channel_selector': config.get('input_channel_selector', None), + 'target_key': None, + 'target_channel_selector': None, + 'batch_size': config['batch_size'], + 'shuffle': False, + 'num_workers': config.get('num_workers', min(config['batch_size'], os.cpu_count() - 1)), + 'pin_memory': True, + } + + temporary_dataloader = self._setup_dataloader_from_config(config=DictConfig(dl_config)) + return temporary_dataloader + + @staticmethod + def match_batch_length(input: torch.Tensor, batch_length: int) -> torch.Tensor: + """Trim or pad the output to match the batch length. + + Args: + input: tensor with shape (B, C, T) + batch_length: int + + Returns: + Tensor with shape (B, C, T), where T matches the + batch length. + """ + input_length = input.size(-1) + pad_length = batch_length - input_length + pad = (0, pad_length) + # pad with zeros or crop + return torch.nn.functional.pad(input, pad, 'constant', 0) + + @torch.no_grad() + def process( + self, + paths2audio_files: List[str], + output_dir: str, + batch_size: int = 1, + num_workers: Optional[int] = None, + input_channel_selector: Optional[ChannelSelectorType] = None, + ) -> List[str]: """ Takes paths to audio files and returns a list of paths to processed audios. Args: paths2audio_files: paths to audio files to be processed - output_dir: directory to save processed files - batch_size: batch size for inference + output_dir: directory to save the processed files + batch_size: (int) batch size to use during inference. + num_workers: Number of workers for the dataloader + input_channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. + If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Returns: Paths to processed audio signals. 
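        Example:
            An illustrative sketch (assumes an instantiated model ``model``; the audio file
            names are hypothetical):

            >>> processed_paths = model.process(
            ...     paths2audio_files=['noisy_1.wav', 'noisy_2.wav'],
            ...     output_dir='enhanced',
            ...     batch_size=2,
            ... )
            >>> # each file is saved as enhanced/processed_<original file name>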
""" - pass + if paths2audio_files is None or len(paths2audio_files) == 0: + return {} + + if num_workers is None: + num_workers = min(batch_size, os.cpu_count() - 1) + + # Output + paths2processed_files = [] + + # Model's mode and device + mode = self.training + device = next(self.parameters()).device + + try: + # Switch model to evaluation mode + self.eval() + # Freeze weights + self.freeze() + + logging_level = logging.get_verbosity() + logging.set_verbosity(logging.WARNING) + + # Processing + with tempfile.TemporaryDirectory() as tmpdir: + # Save temporary manifest + temporary_manifest_filepath = os.path.join(tmpdir, 'manifest.json') + with open(temporary_manifest_filepath, 'w', encoding='utf-8') as fp: + for audio_file in paths2audio_files: + entry = {'input_filepath': audio_file, 'duration': librosa.get_duration(path=audio_file)} + fp.write(json.dumps(entry) + '\n') + + config = { + 'manifest_filepath': temporary_manifest_filepath, + 'input_key': 'input_filepath', + 'input_channel_selector': input_channel_selector, + 'batch_size': min(batch_size, len(paths2audio_files)), + 'num_workers': num_workers, + } + + # Create output dir if necessary + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + + # DataLoader for the input files + temporary_dataloader = self._setup_process_dataloader(config) + + # Indexing of the original files, used to form the output file name + file_idx = 0 + + # Process batches + for test_batch in tqdm(temporary_dataloader, desc="Processing"): + input_signal = test_batch[0] + input_length = test_batch[1] + + # Expand channel dimension, if necessary + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = input_signal.unsqueeze(1) + + processed_batch, _ = self.forward( + input_signal=input_signal.to(device), input_length=input_length.to(device) + ) + + for example_idx in range(processed_batch.size(0)): + # This assumes the data loader is not shuffling files + file_name = os.path.basename(paths2audio_files[file_idx]) + # Prepare output file + output_file = os.path.join(output_dir, f'processed_{file_name}') + # Crop the output signal to the actual length + output_signal = processed_batch[example_idx, :, : input_length[example_idx]].cpu().numpy() + # Write audio + sf.write(output_file, output_signal.T, self.sample_rate, 'float') + # Update the file counter + file_idx += 1 + # Save processed file + paths2processed_files.append(output_file) + + del test_batch + del processed_batch + + finally: + # set mode back to its original value + self.train(mode=mode) + if mode is True: + self.unfreeze() + logging.set_verbosity(logging_level) + + return paths2processed_files @classmethod def list_available_models(cls) -> 'List[PretrainedModelInfo]': diff --git a/nemo/collections/asr/models/enhancement_models.py b/nemo/collections/asr/models/enhancement_models.py index b80c357364aa..b765ae0fddad 100644 --- a/nemo/collections/asr/models/enhancement_models.py +++ b/nemo/collections/asr/models/enhancement_models.py @@ -16,6 +16,8 @@ import tempfile from typing import Dict, List, Optional, Union +import einops +import hydra import librosa import soundfile as sf import torch @@ -23,17 +25,13 @@ from pytorch_lightning import Trainer from tqdm import tqdm -from nemo.collections.asr.data import audio_to_audio_dataset -from nemo.collections.asr.data.audio_to_audio_lhotse import LhotseAudioToTargetDataset -from nemo.collections.asr.data.audio_to_text_dataset import 
inject_dataloader_value_from_model_config + from nemo.collections.asr.models.audio_to_audio_model import AudioToAudioModel -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.neural_types import AudioSignal, LengthsType, NeuralType +from nemo.core.neural_types import AudioSignal, LengthsType, LossType, NeuralType from nemo.utils import logging -__all__ = ['EncMaskDecAudioToAudioModel'] +__all__ = ['EncMaskDecAudioToAudioModel', 'ScoreBasedGenerativeAudioToAudioModel', 'PredictiveAudioToAudioModel'] class EncMaskDecAudioToAudioModel(AudioToAudioModel): @@ -69,10 +67,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): logging.debug('Mixture consistency not used') self.mixture_consistency = None - # Future enhancement: - # If subclasses need to modify the config before calling super() - # Check ASRBPE* classes do with their mixin - # Setup augmentation if hasattr(self.cfg, 'channel_augment') and self.cfg.channel_augment is not None: logging.debug('Using channel augmentation') @@ -84,254 +78,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): # Setup optional Optimization flags self.setup_optimization_flags() - @torch.no_grad() - def process( - self, - paths2audio_files: List[str], - output_dir: str, - batch_size: int = 1, - num_workers: Optional[int] = None, - input_channel_selector: Optional[ChannelSelectorType] = None, - ) -> List[str]: - """ - Process audio files provided in paths2audio_files. - Processed signals will be saved in output_dir. - - Args: - paths2audio_files: (a list) of paths to audio files. \ - Recommended length per file is between 5 and 25 seconds. \ - But it is possible to pass a few hours long file if enough GPU memory is available. - output_dir: - batch_size: (int) batch size to use during inference. - Bigger will result in better throughput performance but would use more memory. - num_workers: Number of workers for the dataloader - input_channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. 
- - Returns: - """ - if paths2audio_files is None or len(paths2audio_files) == 0: - return {} - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - # Output - paths2processed_files = [] - - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - - try: - # Switch model to evaluation mode - self.eval() - # Freeze weights - self.freeze() - - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - - # Processing - with tempfile.TemporaryDirectory() as tmpdir: - # Save temporary manifest - temporary_manifest_filepath = os.path.join(tmpdir, 'manifest.json') - with open(temporary_manifest_filepath, 'w', encoding='utf-8') as fp: - for audio_file in paths2audio_files: - entry = {'input_filepath': audio_file, 'duration': librosa.get_duration(path=audio_file)} - fp.write(json.dumps(entry) + '\n') - - config = { - 'manifest_filepath': temporary_manifest_filepath, - 'input_key': 'input_filepath', - 'input_channel_selector': input_channel_selector, - 'batch_size': min(batch_size, len(paths2audio_files)), - 'num_workers': num_workers, - } - - # Create output dir if necessary - if not os.path.isdir(output_dir): - os.makedirs(output_dir) - - # DataLoader for the input files - temporary_dataloader = self._setup_process_dataloader(config) - - # Indexing of the original files, used to form the output file name - file_idx = 0 - - # Process batches - for test_batch in tqdm(temporary_dataloader, desc="Processing"): - input_signal = test_batch[0] - input_length = test_batch[1] - - # Expand channel dimension, if necessary - # For consistency, the model uses multi-channel format, even if the channel dimension is 1 - if input_signal.ndim == 2: - input_signal = input_signal.unsqueeze(1) - - processed_batch, _ = self.forward( - input_signal=input_signal.to(device), input_length=input_length.to(device) - ) - - for example_idx in range(processed_batch.size(0)): - # This assumes the data loader is not shuffling files - file_name = os.path.basename(paths2audio_files[file_idx]) - # Prepare output file - output_file = os.path.join(output_dir, f'processed_{file_name}') - # Crop the output signal to the actual length - output_signal = processed_batch[example_idx, :, : input_length[example_idx]].cpu().numpy() - # Write audio - sf.write(output_file, output_signal.T, self.sample_rate, 'float') - # Update the file counter - file_idx += 1 - # Save processed file - paths2processed_files.append(output_file) - - del test_batch - del processed_batch - - finally: - # set mode back to its original value - self.train(mode=mode) - if mode is True: - self.unfreeze() - logging.set_verbosity(logging_level) - - return paths2processed_files - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - - if config.get("use_lhotse", False): - return get_lhotse_dataloader_from_config( - config, global_rank=self.global_rank, world_size=self.world_size, dataset=LhotseAudioToTargetDataset() - ) - - is_concat = config.get('is_concat', False) - if is_concat: - raise NotImplementedError('Concat not implemented') - - # TODO: Consider moving `inject` from `audio_to_text_dataset` to a utility module? 
- # Automatically inject args from model config to dataloader config - inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - - # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): - raise NotImplementedError('Tarred datasets not supported') - - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - - dataset = audio_to_audio_dataset.get_audio_to_target_dataset(config=config) - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=config['shuffle'], - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the training data loader via a Dict-like object. - - Args: - train_data_config: A config that contains the information regarding construction - of a training dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` - """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: - raise NotImplementedError('Tarred datasets not supported') - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - - Args: - val_data_config: A config that contains the information regarding construction - of a validation dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the test data loader via a Dict-like object. - - Args: - test_data_config: A config that contains the information regarding construction - of a test dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` - """ - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) - - def _setup_process_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """Prepare a dataloader for processing files. 
- - Args: - config: A python dictionary which contains the following keys: - manifest_filepath: path to a manifest file - input_key: key with audio filepaths in the manifest - input_channel_selector: Optional, used to select a subset of channels from input audio files - batch_size: batch size for the dataloader - num_workers: number of workers for the dataloader - - Returns: - A pytorch DataLoader for the given manifest filepath. - """ - dl_config = { - 'manifest_filepath': config['manifest_filepath'], - 'sample_rate': self.sample_rate, - 'input_key': config['input_key'], - 'input_channel_selector': config.get('input_channel_selector', None), - 'target_key': None, - 'target_channel_selector': None, - 'batch_size': config['batch_size'], - 'shuffle': False, - 'num_workers': config.get('num_workers', min(config['batch_size'], os.cpu_count() - 1)), - 'pin_memory': True, - } - - temporary_dataloader = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_dataloader - @property def input_types(self) -> Dict[str, NeuralType]: return { @@ -350,23 +96,6 @@ def output_types(self) -> Dict[str, NeuralType]: "output_length": NeuralType(tuple('B'), LengthsType(), optional=True), } - def match_batch_length(self, input: torch.Tensor, batch_length: int): - """Trim or pad the output to match the batch length. - - Args: - input: tensor with shape (B, C, T) - batch_length: int - - Returns: - Tensor with shape (B, C, T), where T matches the - batch length. - """ - input_length = input.size(-1) - pad_length = batch_length - input_length - pad = (0, pad_length) - # pad with zeros or crop - return torch.nn.functional.pad(input, pad, 'constant', 0) - @typecheck() def forward(self, input_signal, input_length=None): """ @@ -380,6 +109,7 @@ def forward(self, input_signal, input_length=None): sequences. Returns: + Output signal `output` in the time domain and the length of the output signal `output_length`. 
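        Example:
            An illustrative sketch (assumes an instantiated enhancement model ``model`` and
            single-channel 16 kHz audio; values are hypothetical):

            >>> audio = torch.randn(2, 1, 16000)         # (batch, channel, time)
            >>> lengths = torch.tensor([16000, 12000])
            >>> output, output_length = model(input_signal=audio, input_length=lengths)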
""" batch_length = input_signal.size(-1) @@ -414,12 +144,11 @@ def training_step(self, batch, batch_idx): else: input_signal, input_length, target_signal, _ = batch - # Expand channel dimension, if necessary # For consistency, the model uses multi-channel format, even if the channel dimension is 1 if input_signal.ndim == 2: - input_signal = input_signal.unsqueeze(1) + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') if target_signal.ndim == 2: - target_signal = target_signal.unsqueeze(1) + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') # Apply channel augmentation if self.training and self.channel_augmentation is not None: @@ -449,12 +178,11 @@ def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = else: input_signal, input_length, target_signal, _ = batch - # Expand channel dimension, if necessary # For consistency, the model uses multi-channel format, even if the channel dimension is 1 if input_signal.ndim == 2: - input_signal = input_signal.unsqueeze(1) + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') if target_signal.ndim == 2: - target_signal = target_signal.unsqueeze(1) + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') # Process input processed_signal, _ = self.forward(input_signal=input_signal, input_length=input_length) @@ -485,3 +213,406 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: results = [] return results + + +class PredictiveAudioToAudioModel(AudioToAudioModel): + """This models aims to directly estimate the coefficients + in the encoded domain by applying a neural model. + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + super().__init__(cfg=cfg, trainer=trainer) + self.sample_rate = self._cfg.sample_rate + + # Setup processing modules + self.encoder = self.from_config_dict(self._cfg.encoder) + self.decoder = self.from_config_dict(self._cfg.decoder) + + # Neural estimator + self.estimator = self.from_config_dict(self._cfg.estimator) + + # Normalization + self.normalize_input = self._cfg.get('normalize_input', False) + + # Term added to the denominator to improve numerical stability + self.eps = self._cfg.get('eps', 1e-8) + + # Setup optional Optimization flags + self.setup_optimization_flags() + + logging.debug('Initialized %s', self.__class__.__name__) + logging.debug('\tnormalize_input: %s', self.normalize_input) + logging.debug('\teps: %s', self.eps) + + @property + def input_types(self) -> Dict[str, NeuralType]: + return { + "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)), + "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + return { + "output_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)), + "output_length": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @typecheck() + def forward(self, input_signal, input_length=None): + """Forward pass of the model. + + Args: + input_signal: time-domain signal + input_length: valid length of each example in the batch + + Returns: + Output signal `output` in the time domain and the length of the output signal `output_length`. 
+ """ + batch_length = input_signal.size(-1) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + + # Encoder + encoded, encoded_length = self.encoder(input=input_signal, input_length=input_length) + + # Backbone + estimated, estimated_length = self.estimator(input=encoded, input_length=encoded_length) + + # Decoder + output, output_length = self.decoder(input=estimated, input_length=estimated_length) + + if self.normalize_input: + # rescale to the original scale + output = output * norm_scale + + # Trim or pad the estimated signal to match input length + output = self.match_batch_length(input=output, batch_length=batch_length) + return output, output_length + + # PTL-specific methods + def training_step(self, batch, batch_idx): + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + # Estimate the signal + output_signal, _ = self.forward(input_signal=input_signal, input_length=input_length) + + # Calculate the loss + loss = self.loss(estimate=output_signal, target=target_signal, input_length=input_length) + + # Logs + self.log('train_loss', loss) + self.log('learning_rate', self._optimizer.param_groups[0]['lr']) + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return loss + + def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + # Estimate the signal + output_signal, _ = self.forward(input_signal=input_signal, input_length=input_length) + + # Prepare output + loss = self.loss(estimate=output_signal, target=target_signal, input_length=input_length) + + # Update metrics + if hasattr(self, 'metrics') and tag in self.metrics: + # Update metrics for this (tag, dataloader_idx) + for name, metric in self.metrics[tag][dataloader_idx].items(): + metric.update(preds=output_signal, target=target_signal, input_length=input_length) + + # Log global step + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return {f'{tag}_loss': loss} + + +class ScoreBasedGenerativeAudioToAudioModel(AudioToAudioModel): + """This models is using a score-based diffusion process to generate + an encoded representation of the enhanced signal. 
+ + The model consists of the following blocks: + - encoder: transforms input multi-channel audio signal into an encoded representation (analysis transform) + - estimator: neural model, estimates a score for the diffusion process + - sde: stochastic differential equation (SDE) defining the forward and reverse diffusion process + - sampler: sampler for the reverse diffusion process, estimates coefficients of the target signal + - decoder: transforms sampler output into the time domain (synthesis transform) + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + super().__init__(cfg=cfg, trainer=trainer) + self.sample_rate = self._cfg.sample_rate + + # Setup processing modules + self.encoder = self.from_config_dict(self._cfg.encoder) + self.decoder = self.from_config_dict(self._cfg.decoder) + + # Neural score estimator + self.estimator = self.from_config_dict(self._cfg.estimator) + + # SDE + self.sde = self.from_config_dict(self._cfg.sde) + + # Sampler + if 'sde' in self._cfg.sampler: + raise ValueError('SDE should be defined in the model config, not in the sampler config') + if 'score_estimator' in self._cfg.sampler: + raise ValueError('Score estimator should be defined in the model config, not in the sampler config') + + self.sampler = hydra.utils.instantiate(self._cfg.sampler, sde=self.sde, score_estimator=self.estimator) + + # Normalization + self.normalize_input = self._cfg.get('normalize_input', False) + + # Metric evaluation + self.max_utts_evaluation_metrics = self._cfg.get('max_utts_evaluation_metrics') + + if self.max_utts_evaluation_metrics is not None: + logging.warning( + 'Metrics will be evaluated on first %d examples of the evaluation datasets.', + self.max_utts_evaluation_metrics, + ) + + # Term added to the denominator to improve numerical stability + self.eps = self._cfg.get('eps', 1e-8) + + # Setup optional Optimization flags + self.setup_optimization_flags() + + logging.debug('Initialized %s', self.__class__.__name__) + logging.debug('\tnormalize_input: %s', self.normalize_input) + logging.debug('\teps: %s', self.eps) + + @property + def input_types(self) -> Dict[str, NeuralType]: + return { + "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)), + "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + return { + "output_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)), + "output_length": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @typecheck() + @torch.inference_mode() + def forward(self, input_signal, input_length=None): + """Forward pass of the model. + + Forward pass of the model aplies the following steps: + - encoder to obtain the encoded representation of the input signal + - sampler to generate the estimated coefficients of the target signal + - decoder to transform the sampler output into the time domain + + Args: + input_signal: Tensor that represents a batch of raw audio signals, + of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as + `self.sample_rate` number of floating point values. + input_signal_length: Vector of length B, that contains the individual lengths of the audio + sequences. + + Returns: + Output signal `output` in the time domain and the length of the output signal `output_length`. 
+ """ + batch_length = input_signal.size(-1) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + + # Encoder + encoded, encoded_length = self.encoder(input=input_signal, input_length=input_length) + + # Sampler + generated, generated_length = self.sampler( + prior_mean=encoded, score_condition=encoded, state_length=encoded_length + ) + + # Decoder + output, output_length = self.decoder(input=generated, input_length=generated_length) + + if self.normalize_input: + # rescale to the original scale + output = output * norm_scale + + # Trim or pad the estimated signal to match input length + output = self.match_batch_length(input=output, batch_length=batch_length) + return output, output_length + + @typecheck( + input_types={ + "target_signal": NeuralType(('B', 'C', 'T'), AudioSignal()), + "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal()), + "input_length": NeuralType(tuple('B'), LengthsType()), + }, + output_types={"loss": NeuralType(None, LossType()),}, + ) + def _step(self, target_signal, input_signal, input_length=None): + """Randomly generate a time step for each example in the batch, estimate + the score and calculate the loss value. + + Note that this step does not include sampler. + """ + batch_size = target_signal.size(0) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + # scale the target signal + target_signal = target_signal / (norm_scale + self.eps) + + # Apply encoder to both target and the input + input_enc, input_enc_len = self.encoder(input=input_signal, input_length=input_length) + target_enc, _ = self.encoder(input=target_signal, input_length=input_length) + + # Generate random time steps + sde_time = self.sde.generate_time(size=batch_size, device=input_enc.device) + + # Get the mean and the variance of the perturbation kernel + pk_mean, pk_std = self.sde.perturb_kernel_params(state=target_enc, prior_mean=input_enc, time=sde_time) + + # Generate a random sample from a standard normal distribution + z_norm = torch.randn_like(input_enc) + + # Prepare perturbed data + perturbed_enc = pk_mean + pk_std * z_norm + + # Score is conditioned on the perturbed data and the input + estimator_input = torch.cat([perturbed_enc, input_enc], dim=-3) + + # Estimate the score using the neural estimator + # SDE time is used to inform the estimator about the current time step + # Note: + # - some implementations use `score = -self._raw_dnn_output(x, t, y)` + # - this seems to be unimportant, and is an artifact of transfering code from the original Song's repo + score_est, score_len = self.estimator(input=estimator_input, input_length=input_enc_len, condition=sde_time) + + # Score loss weighting as in Section 4.2 in http://arxiv.org/abs/1907.05600 + score_est = score_est * pk_std + score_ref = -z_norm + + # Score matching loss on the normalized scores + loss = self.loss(estimate=score_est, target=score_ref, input_length=score_len) + + return loss + + # PTL-specific methods + def training_step(self, batch, batch_idx): + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, 
_ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + # Calculate the loss + loss = self._step(target_signal=target_signal, input_signal=input_signal, input_length=input_length) + + # Logs + self.log('train_loss', loss) + self.log('learning_rate', self._optimizer.param_groups[0]['lr']) + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return loss + + def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + # Calculate loss + loss = self._step(target_signal=target_signal, input_signal=input_signal, input_length=input_length) + + # Update metrics + update_metrics = False + if self.max_utts_evaluation_metrics is None: + # Always update if max is not configured + update_metrics = True + # Number of examples to process + num_examples = input_signal.size(0) # batch size + else: + # Check how many examples have been used for metric calculation + first_metric_name = next(iter(self.metrics[tag][dataloader_idx])) + num_examples_evaluated = self.metrics[tag][dataloader_idx][first_metric_name].num_examples + # Update metrics if some examples were not processed + update_metrics = num_examples_evaluated < self.max_utts_evaluation_metrics + # Number of examples to process + num_examples = min(self.max_utts_evaluation_metrics - num_examples_evaluated, input_signal.size(0)) + + if update_metrics: + # Generate output signal + output_signal, _ = self.forward( + input_signal=input_signal[:num_examples, ...], input_length=input_length[:num_examples] + ) + + # Update metrics + if hasattr(self, 'metrics') and tag in self.metrics: + # Update metrics for this (tag, dataloader_idx) + for name, metric in self.metrics[tag][dataloader_idx].items(): + metric.update( + preds=output_signal, + target=target_signal[:num_examples, ...], + input_length=input_length[:num_examples], + ) + + # Log global step + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return {f'{tag}_loss': loss} diff --git a/nemo/collections/asr/modules/audio_modules.py b/nemo/collections/asr/modules/audio_modules.py index 82cfbefeb8d9..67a923099cde 100644 --- a/nemo/collections/asr/modules/audio_modules.py +++ b/nemo/collections/asr/modules/audio_modules.py @@ -17,7 +17,7 @@ import numpy as np import torch -from nemo.collections.asr.losses.audio_losses import temporal_mean +from nemo.collections.asr.losses.audio_losses import calculate_mean from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder from nemo.collections.asr.parts.preprocessing.features import make_seq_mask_like from nemo.collections.asr.parts.submodules.multichannel_modules import ( @@ -39,6 +39,7 @@ 'MaskReferenceChannel', 'MaskBasedBeamformer', 'MaskBasedDereverbWPE', + 
'MixtureConsistencyProjection', ] @@ -158,7 +159,7 @@ def get_mean_time_channel(input: torch.Tensor, input_length: Optional[torch.Tens mean = torch.mean(input, dim=(-1, -3), keepdim=True) else: # temporal mean - mean = temporal_mean(input, input_length, keepdim=True) + mean = calculate_mean(input, input_length, dim=-1, keepdim=True) # channel mean mean = torch.mean(mean, dim=-3, keepdim=True) @@ -186,7 +187,7 @@ def get_mean_std_time_channel( mean = cls.get_mean_time_channel(input, input_length) std = (input - mean).pow(2) # temporal mean - std = temporal_mean(std, input_length, keepdim=True) + std = calculate_mean(std, input_length, dim=-1, keepdim=True) # channel mean std = torch.mean(std, dim=-3, keepdim=True) # final value diff --git a/nemo/collections/asr/modules/audio_preprocessing.py b/nemo/collections/asr/modules/audio_preprocessing.py index cc5312403255..643bc4a69d69 100644 --- a/nemo/collections/asr/modules/audio_preprocessing.py +++ b/nemo/collections/asr/modules/audio_preprocessing.py @@ -709,9 +709,11 @@ class AudioToSpectrogram(NeuralModule): hop_length: length of hops/shifts of the sliding window power: exponent for magnitude spectrogram. Default `None` will return a complex-valued spectrogram + magnitude_power: Transform magnitude of the spectrogram as x^magnitude_power. + scale: Positive scaling of the spectrogram. """ - def __init__(self, fft_length: int, hop_length: int, power: Optional[float] = None): + def __init__(self, fft_length: int, hop_length: int, magnitude_power: float = 1.0, scale: float = 1.0): if not HAVE_TORCHAUDIO: logging.error('Could not import torchaudio. Some features might not work.') @@ -726,12 +728,26 @@ def __init__(self, fft_length: int, hop_length: int, power: Optional[float] = No raise ValueError(f'fft_length = {fft_length} must be divisible by 2') self.stft = torchaudio.transforms.Spectrogram( - n_fft=fft_length, hop_length=hop_length, power=power, pad_mode='constant' + n_fft=fft_length, hop_length=hop_length, power=None, pad_mode='constant' ) # number of subbands self.F = fft_length // 2 + 1 + if magnitude_power <= 0: + raise ValueError(f'Magnitude power needs to be positive: current value {magnitude_power}') + self.magnitude_power = magnitude_power + + if scale <= 0: + raise ValueError(f'Scale needs to be positive: current value {scale}') + self.scale = scale + + logging.debug('Initialized %s with:', self.__class__.__name__) + logging.debug('\tfft_length: %s', fft_length) + logging.debug('\thop_length: %s', hop_length) + logging.debug('\tmagnitude_power: %s', magnitude_power) + logging.debug('\tscale: %s', scale) + @property def num_subbands(self) -> int: return self.F @@ -776,6 +792,14 @@ def forward( with torch.cuda.amp.autocast(enabled=False): output = self.stft(input.float()) + if self.magnitude_power != 1: + # apply power on the magnitude + output = torch.pow(output.abs(), self.magnitude_power) * torch.exp(1j * output.angle()) + + if self.scale != 1: + # apply scaling of the coefficients + output = self.scale * output + if input_length is not None: # Mask padded frames output_length = self.get_output_length(input_length=input_length) @@ -810,11 +834,11 @@ class SpectrogramToAudio(NeuralModule): Args: fft_length: length of FFT hop_length: length of hops/shifts of the sliding window - power: exponent for magnitude spectrogram. Default `None` will - return a complex-valued spectrogram + magnitude_power: Transform magnitude of the spectrogram as x^(1/magnitude_power). 
+ scale: Spectrogram will be scaled with 1/scale before the inverse transform. """ - def __init__(self, fft_length: int, hop_length: int): + def __init__(self, fft_length: int, hop_length: int, magnitude_power: float = 1.0, scale: float = 1.0): if not HAVE_TORCHAUDIO: logging.error('Could not import torchaudio. Some features might not work.') @@ -834,6 +858,20 @@ def __init__(self, fft_length: int, hop_length: int): self.F = fft_length // 2 + 1 + if magnitude_power <= 0: + raise ValueError(f'Magnitude power needs to be positive: current value {magnitude_power}') + self.magnitude_power = magnitude_power + + if scale <= 0: + raise ValueError(f'Scale needs to be positive: current value {scale}') + self.scale = scale + + logging.debug('Initialized %s with:', self.__class__.__name__) + logging.debug('\tfft_length: %s', fft_length) + logging.debug('\thop_length: %s', hop_length) + logging.debug('\tmagnitude_power: %s', magnitude_power) + logging.debug('\tscale: %s', scale) + @property def num_subbands(self) -> int: return self.F @@ -875,7 +913,16 @@ def forward(self, input: torch.Tensor, input_length: Optional[torch.Tensor] = No # iSTFT output (B, C, T) with torch.cuda.amp.autocast(enabled=False): - output = self.istft(input.cfloat()) + output = input.cfloat() + + if self.scale != 1: + # apply 1/scale on the coefficients + output = output / self.scale + + if self.magnitude_power != 1: + # apply 1/power on the magnitude + output = torch.pow(output.abs(), 1 / self.magnitude_power) * torch.exp(1j * output.angle()) + output = self.istft(output) if input_length is not None: # Mask padded samples diff --git a/nemo/collections/asr/parts/submodules/diffusion.py b/nemo/collections/asr/parts/submodules/diffusion.py new file mode 100644 index 000000000000..db3d30f49701 --- /dev/null +++ b/nemo/collections/asr/parts/submodules/diffusion.py @@ -0,0 +1,1310 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from abc import ABC, abstractmethod +from typing import Dict, Optional, Sequence, Tuple, Type + +import einops +import einops.layers.torch +import numpy as np +import torch +import torch.nn.functional as F + +from nemo.collections.common.parts.utils import activation_registry +from nemo.collections.tts.parts.utils.helpers import mask_sequence_tensor +from nemo.core.classes import NeuralModule, typecheck +from nemo.core.neural_types import FloatType, LengthsType, NeuralType, SpectrogramType, VoidType +from nemo.utils import logging + +__all__ = [ + 'OrnsteinUhlenbeckVarianceExplodingSDE', + 'SpectrogramNoiseConditionalScoreNetworkPlusPlus', + 'NoiseConditionalScoreNetworkPlusPlus', + 'PredictorCorrectorSampler', +] + + +class StochasticDifferentialEquation(NeuralModule, ABC): + """Base class for stochastic differential equations. 
+ """ + + def __init__(self, time_min: float, time_max: float, num_steps: int): + super().__init__() + + # min and max time + if time_min <= 0: + raise ValueError(f'time_min should be positive, current value {time_min}') + + if time_max <= time_min: + raise ValueError(f'time_max should be larger than time_min, current max {time_max} and min {time_min}') + + self.time_min = time_min + self.time_max = time_max + + # number of steps + if num_steps <= 0: + raise ValueError(f'num_steps needs to be positive: current value {num_steps}') + + self.num_steps = num_steps + + @property + def dt(self) -> float: + """Time step for this SDE. + This denotes the step size between `0` and `self.time_max` when using `self.num_steps`. + """ + return self.time_max / self.num_steps + + @property + def time_delta(self) -> float: + """Time range for this SDE. + """ + return self.time_max - self.time_min + + def generate_time(self, size: int, device: torch.device) -> torch.Tensor: + """Generate random time steps in the valid range. + + Time steps are generated between `self.time_min` and `self.time_max`. + + Args: + size: number of samples + device: device to use + + Returns: + A tensor of floats with shape (size,) + """ + time = torch.rand(size, device=device) * self.time_delta + self.time_min + return time + + @abstractmethod + def coefficients(self, state: torch.Tensor, time: torch.Tensor, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + state: tensor of shape (B, C, D, T) + time: tensor of shape (B,) + + Returns: + Tuple with drift and diffusion coefficients. + """ + pass + + @typecheck( + input_types={"prior_mean": NeuralType(('B', 'C', 'D', 'T'), VoidType()),}, + output_types={"sample": NeuralType(('B', 'C', 'D', 'T'), VoidType()),}, + ) + @abstractmethod + def prior_sampling(self, prior_mean: torch.Tensor) -> torch.Tensor: + """Generate a sample from the prior distribution p_T. + + Args: + prior_mean: Mean of the prior distribution + + Returns: + A sample from the prior distribution. + """ + pass + + def discretize( + self, *, state: torch.Tensor, time: torch.Tensor, state_length: Optional[torch.Tensor] = None, **kwargs + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Assume we have the following SDE: + + dx = drift(x, t) * dt + diffusion(x, t) * dwt + + where `wt` is the standard Wiener process. + + We assume the following discretization: + + new_state = current_state + total_drift + total_diffusion * z_norm + + where `z_norm` is sampled from normal distribution with zero mean and unit variance. + + Args: + state: current state of the process, shape (B, C, D, T) + time: current time of the process, shape (B,) + state_length: length of the valid time steps for each example in the batch, shape (B,) + **kwargs: other parameters + + Returns: + Drift and diffusion. + """ + # Get coefficients + drift_coefficient, diffusion_coefficient = self.coefficients( + state=state, time=time, state_length=state_length, **kwargs + ) + + # Discretized drift + drift = drift_coefficient * self.dt + + # Note: + # Scale with sqrt(dt) because z_norm is sampled from a normal distribution with zero mean and + # unit variance and dwt is normally distributed with zero mean and variance dt + diffusion = diffusion_coefficient * np.sqrt(self.dt) + + return drift, diffusion + + @abstractmethod + def copy(self): + """Create a copy of this SDE. 
+ """ + pass + + def __repr__(self): + desc = f'{self.__class__.__name__}(time_min={self.time_min}, time_max={self.time_max}, num_steps={self.num_steps})' + desc += f'\n\tdt: {self.dt}' + desc += f'\n\ttime_delta: {self.time_delta}' + return desc + + +class OrnsteinUhlenbeckVarianceExplodingSDE(StochasticDifferentialEquation): + """This class implements the Ornstein-Uhlenbeck SDE with variance exploding noise schedule. + + The SDE is given by: + + dx = theta * (y - x) dt + g(t) dw + + where `theta` is the stiffness parameter and `g(t)` is the diffusion coefficient: + + g(t) = std_min * (std_max/std_min)^t * sqrt(2 * log(std_max/std_min)) + + References: + Richter et al., Speech Enhancement and Dereverberation with Diffusion-based Generative Models, Tr. ASLP 2023 + """ + + def __init__( + self, + stiffness: float, + std_min: float, + std_max: float, + num_steps: int = 100, + time_min: float = 3e-2, + time_max: float = 1.0, + eps: float = 1e-8, + ): + super().__init__(time_min=time_min, time_max=time_max, num_steps=num_steps) + + # Small regularization + if eps <= 0: + raise ValueError(f'eps should be positive, current value {eps}') + self.eps = eps + + # stifness + self.stiffness = stiffness + + # noise schedule + if std_min <= 0: + raise ValueError(f'std_min should be positive, current value {std_min}') + + if std_max <= std_min: + raise ValueError(f'std_max should be larger than std_min, current max {std_max} and min {std_min}') + + self.std_min = std_min + self.std_max = std_max + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tstiffness: %s', self.stiffness) + logging.debug('\tstd_min: %s', self.std_min) + logging.debug('\tstd_max: %s', self.std_max) + logging.debug('\tnum_steps: %s', self.num_steps) + logging.debug('\ttime_min: %s', self.time_min) + logging.debug('\ttime_max: %s', self.time_max) + logging.debug('\teps: %s', self.eps) + + @property + def std_ratio(self) -> float: + return self.std_max / (self.std_min + self.eps) + + @property + def log_std_ratio(self) -> float: + return np.log(self.std_ratio + self.eps) + + @typecheck( + input_types={ + "state": NeuralType(('B', 'C', 'D', 'T'), VoidType()), + "prior_mean": NeuralType(('B', 'C', 'D', 'T'), VoidType()), + "time": NeuralType(tuple('B'), FloatType()), + }, + output_types={"mean": NeuralType(('B', 'C', 'D', 'T'), FloatType()),}, + ) + def perturb_kernel_mean(self, state: torch.Tensor, prior_mean: torch.Tensor, time: torch.Tensor) -> torch.Tensor: + """Return the mean of the perturbation kernel for this SDE. + + Args: + state: current state of the process, shape (B, C, D, T) + prior_mean: mean of the prior distribution + time: current time of the process, shape (B,) + + Returns: + A tensor of shape (B, C, D, T) + """ + # exponential weighting + weight = torch.exp(-self.stiffness * time) + + # view as [B, C, D, T] + weight = weight.view(-1, 1, 1, 1) + + # closed-form mean + mean = weight * state + (1 - weight) * prior_mean + + return mean + + @typecheck( + input_types={"time": NeuralType(tuple('B'), FloatType()),}, + output_types={"std": NeuralType(tuple('B'), FloatType()),}, + ) + def perturb_kernel_std(self, time: torch.Tensor) -> torch.Tensor: + """Return the standard deviation of the perturbation kernel for this SDE. + + Note that the standard deviation depends on the time and the noise schedule, + which is parametrized using `self.stiffness`, `self.std_min` and `self.std_max`. 
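        In closed form (this is what the implementation below computes, up to the small
        ``eps`` regularization), the variance at time ``t`` is

            var(t) = std_min**2 * log(std_max / std_min)
                     * ((std_max / std_min)**(2 * t) - exp(-2 * stiffness * t))
                     / (stiffness + log(std_max / std_min))

        and the returned standard deviation is ``sqrt(var(t))``.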
+ + Args: + time: current time of the process, shape (B,) + + Returns: + A tensor of shape (B,) + """ + var = (self.std_min ** 2) * self.log_std_ratio + var *= torch.pow(self.std_ratio, 2 * time) - torch.exp(-2 * self.stiffness * time) + var /= self.stiffness + self.log_std_ratio + std = torch.sqrt(var) + return std + + @typecheck( + input_types={ + "state": NeuralType(('B', 'C', 'D', 'T'), VoidType()), + "prior_mean": NeuralType(('B', 'C', 'D', 'T'), VoidType()), + "time": NeuralType(tuple('B'), FloatType()), + }, + output_types={ + "mean": NeuralType(('B', 'C', 'D', 'T'), FloatType()), + "std": NeuralType(('B', 'C', 'D', 'T'), FloatType()), + }, + ) + def perturb_kernel_params(self, state: torch.Tensor, prior_mean: torch.Tensor, time: torch.Tensor) -> torch.Tensor: + """Return the mean and standard deviation of the perturbation kernel for this SDE. + + Args: + state: current state of the process, shape (B, C, D, T) + prior_mean: mean of the prior distribution + time: current time of the process, shape (B,) + """ + assert torch.all(time <= self.time_max) + assert torch.all(time >= self.time_min) + + # compute the mean + mean = self.perturb_kernel_mean(state=state, prior_mean=prior_mean, time=time) + + # compute the standard deviation + std = self.perturb_kernel_std(time=time) + # view as [B, C, D, T] + std = std.view(-1, 1, 1, 1) + + return mean, std + + @typecheck( + input_types={ + "state": NeuralType(('B', 'C', 'D', 'T'), VoidType()), + "time": NeuralType(tuple('B'), VoidType()), + "prior_mean": NeuralType(('B', 'C', 'D', 'T'), VoidType()), + "state_length": NeuralType(tuple('B'), LengthsType(), optional=True), + }, + output_types={ + "drift_coefficient": NeuralType(('B', 'C', 'D', 'T'), FloatType()), + "diffusion_coefficient": NeuralType(('B', 'C', 'D', 'T'), FloatType()), + }, + ) + def coefficients( + self, + state: torch.Tensor, + time: torch.Tensor, + prior_mean: torch.Tensor, + state_length: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute drift and diffusion coefficients for this SDE. + + Args: + state: current state of the process, shape (B, C, D, T) + time: current time of the process, shape (B,) + prior_mean: mean of the prior distribution + state_length: length of the valid time steps for each example in the batch + + Returns: + Drift and diffusion coefficients. + """ + # Drift coefficient + drift_coefficient = self.stiffness * (prior_mean - state) + + # Diffusion coefficient + diffusion_coefficient = self.std_min * torch.pow(self.std_ratio, time) * np.sqrt(2 * self.log_std_ratio) + # View in the same shape as the state + diffusion_coefficient = diffusion_coefficient.view(-1, *([1] * (state.dim() - 1))) + + if state_length is not None: + drift_coefficient = mask_sequence_tensor(drift_coefficient, state_length) + diffusion_coefficient = mask_sequence_tensor(diffusion_coefficient, state_length) + + return drift_coefficient, diffusion_coefficient + + def prior_sampling(self, prior_mean: torch.Tensor) -> torch.Tensor: + """Generate a sample from the prior distribution p_T. 
+ + Args: + prior_mean: Mean of the prior distribution + """ + # Final time step for all samples in the batch + time = self.time_max * torch.ones(prior_mean.shape[0], device=prior_mean.device) + + # Compute the std of the prior distribution + std = self.perturb_kernel_std(time=time) + + # view as [B, C, D, T] + std = std.view(-1, 1, 1, 1) + + # Generate a sample from a normal distribution centered at prior_mean + sample = prior_mean + torch.randn_like(prior_mean) * std + + return sample + + def copy(self): + return OrnsteinUhlenbeckVarianceExplodingSDE( + stiffness=self.stiffness, + std_min=self.std_min, + std_max=self.std_max, + num_steps=self.num_steps, + time_min=self.time_min, + time_max=self.time_max, + eps=self.eps, + ) + + def __repr__(self): + desc = f'{self.__class__.__name__}(stiffness={self.stiffness}, std_min={self.std_min}, std_max={self.std_max}, num_steps={self.num_steps}, time_min={self.time_min}, time_max={self.time_max}, eps={self.eps})' + desc += f'\n\tdt: {self.dt}' + desc += f'\n\ttime_delta: {self.time_delta}' + desc += f'\n\tstd_ratio: {self.std_ratio}' + desc += f'\n\tlog_std_ratio: {self.log_std_ratio}' + + return desc + + +class ReverseStochasticDifferentialEquation(StochasticDifferentialEquation): + def __init__(self, *, sde: Type[StochasticDifferentialEquation], score_estimator: Type[NeuralModule]): + """Use the forward SDE and a score estimator to define the reverse SDE. + + Args: + sde: forward SDE + score_estimator: neural score estimator + """ + super().__init__(time_min=sde.time_min, time_max=sde.time_max, num_steps=sde.num_steps) + self.score_estimator = score_estimator + self.forward_sde = sde + + logging.debug('Initialized %s', self.__class__.__name__) + + def coefficients( + self, + state: torch.Tensor, + time: torch.Tensor, + score_condition: Optional[torch.Tensor] = None, + state_length: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute drift and diffusion coefficients for the reverse SDE. + + Args: + state: current state of the process, shape (B, C, D, T) + time: current time of the process, shape (B,) + """ + raise NotImplementedError('Coefficients not necessary for the reverse SDE.') + + def prior_sampling(self, shape: torch.Size, device: torch.device) -> torch.Tensor: + """Prior sampling is not necessary for the reverse SDE. + """ + raise NotImplementedError('Prior sampling not necessary for the reverse SDE.') + + def discretize( + self, + *, + state: torch.Tensor, + time: torch.Tensor, + score_condition: Optional[torch.Tensor] = None, + state_length: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Discretize the reverse SDE. 
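        In terms of the forward SDE quantities, the discretization computed below is

            drift_reverse     = drift_forward - diffusion_forward**2 * score
            diffusion_reverse = diffusion_forward

        where ``score`` is produced by ``self.score_estimator`` from the current state,
        concatenated with ``score_condition`` along the channel dimension when a condition
        is provided.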
+ + Args: + state: current state of the process, shape (B, C, D, T) + time: current time of the process, shape (B,) + score_condition: condition for the score estimator + state_length: length of the valid time steps for each example in the batch + **kwargs: other parameters for discretization of the forward SDE + """ + # Drift and diffusion from the forward SDE + forward_drift, forward_diffusion = self.forward_sde.discretize(state=state, time=time, **kwargs) + + # For input for the score estimator: + # - if no condition is provided, use the state + # - if a condition is provided, concatenate the state and the condition along the channel dimension + score_input = state if score_condition is None else torch.cat([state, score_condition], dim=1) + + # Estimate score + score, _ = self.score_estimator(input=score_input, input_length=state_length, condition=time) + + # Adjust drift + drift = forward_drift - forward_diffusion.pow(2) * score + + # Adjust diffusion + diffusion = forward_diffusion + + if state_length is not None: + drift = mask_sequence_tensor(drift, state_length) + diffusion = mask_sequence_tensor(diffusion, state_length) + + return drift, diffusion + + def copy(self): + return ReverseStochasticDifferentialEquation(sde=self.forward_sde.copy(), score_estimator=self.score_estimator) + + def __repr__(self): + desc = f'{self.__class__.__name__}(sde={self.forward_sde}, score_estimator={self.score_estimator})' + return desc + + +class SpectrogramNoiseConditionalScoreNetworkPlusPlus(NeuralModule): + """This model handles complex-valued inputs by stacking real and imaginary components. + Stacked tensor is processed using NCSN++ and the output is projected to generate real + and imaginary components of the output channels. + + Args: + in_channels: number of input complex-valued channels + out_channels: number of output complex-valued channels + """ + + def __init__(self, *, in_channels: int = 1, out_channels: int = 1, **kwargs): + super().__init__() + + # Number of input signals for this estimator + if in_channels < 1: + raise ValueError( + f'Number of input channels needs to be larger or equal to one, current value {in_channels}' + ) + + self.in_channels = in_channels + + # Number of output signals for this estimator + if out_channels < 1: + raise ValueError( + f'Number of output channels needs to be larger or equal to one, current value {out_channels}' + ) + + self.out_channels = out_channels + + # Instantiate noise conditional score network NCSN++ + ncsnpp_params = kwargs.copy() + ncsnpp_params['in_channels'] = ncsnpp_params['out_channels'] = 2 * self.in_channels # stack real and imag + self.ncsnpp = NoiseConditionalScoreNetworkPlusPlus(**ncsnpp_params) + + # Output projection to generate real and imaginary components of the output channels + self.output_projection = torch.nn.Conv2d( + in_channels=2 * self.in_channels, out_channels=2 * self.out_channels, kernel_size=1 + ) + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tin_channels: %s', self.in_channels) + logging.debug('\tout_channels: %s', self.out_channels) + + @property + def input_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports. + """ + return { + "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "input_length": NeuralType(('B',), LengthsType(), optional=True), + "condition": NeuralType(('B',), FloatType(), optional=True), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports. 
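        For reference, the channel handling in ``forward`` below is: a complex-valued input of
        shape (B, C_in, D, T) is split into real and imaginary parts and reshaped to
        (B, 2 * C_in, D, T) before NCSN++, and the projected output of shape (B, 2 * C_out, D, T)
        is viewed back as a complex tensor of shape (B, C_out, D, T).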
+ """ + return { + "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "output_length": NeuralType(('B',), LengthsType(), optional=True), + } + + @typecheck() + def forward(self, input, input_length=None, condition=None): + # Stack real and imaginary components + B, C_in, D, T = input.shape + + if C_in != self.in_channels: + raise RuntimeError(f'Unexpected input channel size {C_in}, expected {self.in_channels}') + + # Stack real and imaginary parts + input_real_imag = torch.stack([input.real, input.imag], dim=2) + input = einops.rearrange(input_real_imag, 'B C RI F T -> B (C RI) F T') + + # Process using NCSN++ + output, output_length = self.ncsnpp(input=input, input_length=input_length, condition=condition) + + # Output projection + output = self.output_projection(output) + + # Convert to complex-valued signal + output = output.reshape(B, 2, self.out_channels, D, T) + # Move real/imag dimension to the end + output = output.permute(0, 2, 3, 4, 1) + output = torch.view_as_complex(output.contiguous()) + + return output, output_length + + +class NoiseConditionalScoreNetworkPlusPlus(NeuralModule): + """Implementation of Noise Conditional Score Network (NCSN++) architecture. + + References: + - Song et al., Score-Based Generative Modeling through Stochastic Differential Equations, NeurIPS 2021 + - Brock et al., Large scale GAN training for high fidelity natural image synthesis, ICLR 2018 + """ + + def __init__( + self, + nonlinearity: str = "swish", + in_channels: int = 2, # number of channels in the input image + out_channels: int = 2, # number of channels in the output image + channels: Sequence[int] = (128, 128, 256, 256, 256), # number of channels at start + at every resolution + num_res_blocks: int = 2, + num_resolutions: int = 4, + init_scale: float = 1e-5, + conditioned_on_time: bool = False, + fourier_embedding_scale: float = 16.0, + dropout_rate: float = 0.0, + pad_time_to: Optional[int] = None, + pad_dimension_to: Optional[int] = None, + **_, + ): + # Network topology is a flavor of UNet, example chart for num_resolutions=4 + # + # 1: Image → Image/2 → Image/4 → Image/8 + # ↓ ↓ ↓ ↓ + # 2: Hidden → Hidden/2 → Hidden/4 → Hidden/8 + # ↓ ↓ ↓ ↓ + # 3: Hidden ← Hidden/2 ← Hidden/4 ← Hidden/8 + # ↓ ↓ ↓ ↓ + # 4: Image ← Image/2 ← Image/4 ← Image/8 + + # Horizontal arrows in (1) are downsampling + # Vertical arrows from (1) to (2) are channel upconversions + # + # Horizontal arrows in (2) are blocks with downsampling where necessary + # Horizontal arrows in (3) are blocks with upsampling where necessary + # + # Vertical arrows from (1) to (2) are downsampling and channel upconversioins + # Vertical arrows from (2) to (3) are sums connections (also with / sqrt(2)) + # Vertical arrows from (3) to (4) are channel downconversions + # Horizontal arrows in (4) are upsampling and addition + super().__init__() + + # same nonlinearity is used throughout the whole network + self.activation: torch.nn.Module = activation_registry[nonlinearity]() + self.init_scale: float = init_scale + + self.downsample = torch.nn.Upsample(scale_factor=0.5, mode="bilinear") + self.upsample = torch.nn.Upsample(scale_factor=2, mode="bilinear") + + self.in_channels = in_channels + self.out_channels = out_channels + self.channels = channels + self.num_res_blocks = num_res_blocks + self.num_resolutions = num_resolutions + self.conditioned_on_time = conditioned_on_time + + # padding setup + self.pad_time_to = pad_time_to or 2 ** self.num_resolutions + self.pad_dimension_to = pad_dimension_to or 2 ** 
self.num_resolutions + + if self.conditioned_on_time: + self.time_embedding = torch.nn.Sequential( + GaussianFourierProjection(embedding_size=self.channels[0], scale=fourier_embedding_scale), + torch.nn.Linear(self.channels[0] * 2, self.channels[0] * 4), + self.activation, + torch.nn.Linear(self.channels[0] * 4, self.channels[0] * 4), + ) + + self.input_pyramid = torch.nn.ModuleList() + for ch in self.channels[:-1]: + self.input_pyramid.append(torch.nn.Conv2d(in_channels=self.in_channels, out_channels=ch, kernel_size=1)) + + # each block takes an image and outputs an image + # possibly changes number of channels + # output blocks ("reverse" path of the unet) reuse outputs of input blocks ("forward" path) + # so great care must be taken to in/out channels of each block + # resolutions are handled in `forward` + block_params = { + "activation": self.activation, + "dropout_rate": dropout_rate, + "init_scale": self.init_scale, + "diffusion_step_embedding_dim": channels[0] * 4 if self.conditioned_on_time else None, + } + self.input_blocks = torch.nn.ModuleList() + for in_ch, out_ch in zip(self.channels[:-1], self.channels[1:]): + for n in range(num_res_blocks): + block = ResnetBlockBigGANPlusPlus(in_ch=in_ch if n == 0 else out_ch, out_ch=out_ch, **block_params) + self.input_blocks.append(block) + + self.output_blocks = torch.nn.ModuleList() + for in_ch, out_ch in zip(reversed(self.channels[1:]), reversed(self.channels[:-1])): + for n in reversed(range(num_res_blocks)): + block = ResnetBlockBigGANPlusPlus(in_ch=in_ch, out_ch=out_ch if n == 0 else in_ch, **block_params) + self.output_blocks.append(block) + + self.projection_blocks = torch.nn.ModuleList() + for ch in self.channels[:-1]: + self.projection_blocks.append(torch.nn.Conv2d(ch, out_channels, kernel_size=1)) + + assert len(self.input_pyramid) == self.num_resolutions + assert len(self.input_blocks) == self.num_resolutions * self.num_res_blocks + assert len(self.output_blocks) == self.num_resolutions * self.num_res_blocks + assert len(self.projection_blocks) == self.num_resolutions + + self.init_weights_() + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tin_channels: %s', self.in_channels) + logging.debug('\tout_channels: %s', self.out_channels) + logging.debug('\tchannels: %s', self.channels) + logging.debug('\tnum_res_blocks: %s', self.num_res_blocks) + logging.debug('\tnum_resolutions: %s', self.num_resolutions) + logging.debug('\tconditioned_on_time: %s', self.conditioned_on_time) + logging.debug('\tpad_time_to: %s', self.pad_time_to) + logging.debug('\tpad_dimension_to: %s', self.pad_dimension_to) + + def init_weights_(self): + for module in self.modules(): + if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + + # torch.nn submodules with scaled init + for module in self.projection_blocks: + torch.nn.init.xavier_uniform_(module.weight, gain=self.init_scale) + + # non-torch.nn submodules can have their own init schemes + for module in self.modules(): + if module is self: + continue + + if hasattr(module, "init_weights_"): + module.init_weights_() + + @typecheck( + input_types={"input": NeuralType(('B', 'C', 'D', 'T')),}, + output_types={"output": NeuralType(('B', 'C', 'D', 'T')),}, + ) + def pad_input(self, input: torch.Tensor) -> torch.Tensor: + """Pad input tensor to match the required dimensions across `T` and `D`. 
+
+        """
+        *_, D, T = input.shape
+        output = input
+
+        # padding across time
+        if T % self.pad_time_to != 0:
+            output = F.pad(output, (0, self.pad_time_to - T % self.pad_time_to))
+
+        # padding across dimension
+        if D % self.pad_dimension_to != 0:
+            output = F.pad(output, (0, 0, 0, self.pad_dimension_to - D % self.pad_dimension_to))
+
+        return output
+
+    @property
+    def input_types(self) -> Dict[str, NeuralType]:
+        """Returns definitions of module input ports.
+        """
+        return {
+            "input": NeuralType(('B', 'C', 'D', 'T'), VoidType()),
+            "input_length": NeuralType(('B',), LengthsType(), optional=True),
+            "condition": NeuralType(('B',), FloatType(), optional=True),
+        }
+
+    @property
+    def output_types(self) -> Dict[str, NeuralType]:
+        """Returns definitions of module output ports.
+        """
+        return {
+            "output": NeuralType(('B', 'C', 'D', 'T'), VoidType()),
+            "output_length": NeuralType(('B',), LengthsType(), optional=True),
+        }
+
+    @typecheck()
+    def forward(
+        self, *, input: torch.Tensor, input_length: Optional[torch.Tensor], condition: Optional[torch.Tensor] = None
+    ):
+        """Forward pass of the model.
+
+        Args:
+            input: input tensor, shape (B, C, D, T)
+            input_length: length of the valid time steps for each example in the batch, shape (B,)
+            condition: scalar condition (time) for the model, will be embedded using `self.time_embedding`
+        """
+        assert input.shape[1] == self.in_channels
+
+        # apply padding at the input
+        *_, D, T = input.shape
+        input = self.pad_input(input=input)
+
+        if input_length is None:
+            # assume all time frames are valid
+            input_length = torch.LongTensor([input.shape[-1]] * input.shape[0]).to(input.device)
+
+        lengths = input_length
+
+        if condition is not None:
+            if len(condition.shape) != 1:
+                raise ValueError(
+                    f"Expected condition to be a 1-dim tensor, got a {len(condition.shape)}-dim tensor of shape {tuple(condition.shape)}"
+                )
+            if condition.shape[0] != input.shape[0]:
+                raise ValueError(
+                    f"Condition {tuple(condition.shape)} and input {tuple(input.shape)} should match along the batch dimension"
+                )
+
+            condition = self.time_embedding(torch.log(condition))
+
+        # downsample and project input image to add later in the downsampling path
+        pyramid = [input]
+        for resolution_num in range(self.num_resolutions - 1):
+            pyramid.append(self.downsample(pyramid[-1]))
+        pyramid = [block(image) for image, block in zip(pyramid, self.input_pyramid)]
+
+        # downsampling path
+        history = []
+        hidden = torch.zeros_like(pyramid[0])
+        input_blocks = iter(self.input_blocks)
+        for resolution_num, image in enumerate(pyramid):
+            hidden = (hidden + image) / math.sqrt(2.0)
+            hidden = mask_sequence_tensor(hidden, lengths)
+
+            for _ in range(self.num_res_blocks):
+                hidden = next(input_blocks)(hidden, condition)
+                hidden = mask_sequence_tensor(hidden, lengths)
+            history.append(hidden)
+
+            final_resolution = resolution_num == self.num_resolutions - 1
+            if not final_resolution:
+                hidden = self.downsample(hidden)
+                lengths = (lengths / 2).ceil().long()
+
+        # upsampling path
+        to_project = []
+        for residual, block in zip(reversed(history), self.output_blocks):
+            if hidden.shape != residual.shape:
+                to_project.append(hidden)
+                hidden = self.upsample(hidden)
+                lengths = (lengths * 2).long()
+
+            hidden = (hidden + residual) / math.sqrt(2.0)
+            hidden = block(hidden, condition)
+            hidden = mask_sequence_tensor(hidden, lengths)
+
+        to_project.append(hidden)
+
+        # projecting to images
+        images = []
+        for tensor, projection in zip(to_project, reversed(self.projection_blocks)):
+            image = projection(tensor)
+            
images.append(F.interpolate(image, size=input.shape[-2:])) # TODO write this loop using self.upsample + + result = sum(images) + + assert result.shape[-2:] == input.shape[-2:] + + # remove padding + result = result[:, :, :D, :T] + return result, input_length + + +class GaussianFourierProjection(NeuralModule): + """Gaussian Fourier embeddings for input scalars. + + The input scalars are typically time or noise levels. + """ + + def __init__(self, embedding_size: int = 256, scale: float = 1.0): + super().__init__() + self.W = torch.nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False) + + @property + def input_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports. + """ + return { + "input": NeuralType(('B',), FloatType()), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports. + """ + return { + "output": NeuralType(('B', 'D'), VoidType()), + } + + def forward(self, input): + x_proj = input[:, None] * self.W[None, :] * 2 * math.pi + return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1) + + +class ResnetBlockBigGANPlusPlus(torch.nn.Module): + """Implementation of a ResNet block for the BigGAN model. + + References: + - Song et al., Score-Based Generative Modeling through Stochastic Differential Equations, NeurIPS 2021 + - Brock et al., Large scale GAN training for high fidelity natural image synthesis, ICLR 2018 + """ + + def __init__( + self, + activation: torch.nn.Module, + in_ch: int, + out_ch: int, + diffusion_step_embedding_dim: Optional[int] = None, + init_scale: float = 1e-5, + dropout_rate: float = 0.1, + in_num_groups: Optional[int] = None, + out_num_groups: Optional[int] = None, + eps: float = 1e-6, + ): + """ + Args: + activation (torch.nn.Module): activation layer (ReLU, SiLU, etc) + in_ch (int): number of channels in the input image + out_ch (int, optional): number of channels in the output image + diffusion_step_embedding_dim (int, optional): dimension of diffusion timestep embedding. Defaults to None (no embedding). + dropout_rate (float, optional): dropout rate. Defaults to 0.1. + init_scale (float, optional): scaling for weight initialization. Defaults to 0.0. + in_num_groups (int, optional): num_groups in the first GroupNorm. Defaults to min(in_ch // 4, 32) + out_num_groups (int, optional): num_groups in the second GroupNorm. Defaults to min(out_ch // 4, 32) + eps (float, optional): eps parameter of GroupNorms. Defaults to 1e-6. 
+ """ + super().__init__() + in_num_groups = in_num_groups or min(in_ch // 4, 32) + out_num_groups = out_num_groups or min(out_ch // 4, 32) + + self.init_scale = init_scale + + self.input_block = torch.nn.Sequential( + torch.nn.GroupNorm(num_groups=in_num_groups, num_channels=in_ch, eps=eps), activation, + ) + + self.middle_conv = torch.nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=3, padding=1) + if diffusion_step_embedding_dim is not None: + self.diffusion_step_projection = torch.nn.Sequential( + activation, + torch.nn.Linear(diffusion_step_embedding_dim, out_ch), + einops.layers.torch.Rearrange("batch dim -> batch dim 1 1"), + ) + + self.output_block = torch.nn.Sequential( + torch.nn.GroupNorm(num_groups=out_num_groups, num_channels=out_ch, eps=eps), + activation, + torch.nn.Dropout(dropout_rate), + torch.nn.Conv2d(in_channels=out_ch, out_channels=out_ch, kernel_size=3, padding=1), + ) + + if in_ch != out_ch: + self.residual_projection = torch.nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=1) + + self.act = activation + self.in_ch = in_ch + self.out_ch = out_ch + + self.init_weights_() + + def init_weights_(self): + """Weight initialization + """ + for module in self.modules(): + if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + + # a single Conv2d is initialized with gain + torch.nn.init.xavier_uniform_(self.output_block[-1].weight, gain=self.init_scale) + + def forward(self, x: torch.Tensor, diffusion_time_embedding: Optional[torch.Tensor] = None): + """Forward pass of the model. + + Args: + x: input tensor + diffusion_time_embedding: embedding of the diffusion time step + + Returns: + Output tensor + """ + h = self.input_block(x) + h = self.middle_conv(h) + + if diffusion_time_embedding is not None: + h = h + self.diffusion_step_projection(diffusion_time_embedding) + + h = self.output_block(h) + + if x.shape != h.shape: # matching number of channels + x = self.residual_projection(x) + return (x + h) / math.sqrt(2.0) + + +class PredictorCorrectorSampler(NeuralModule): + """Predictor-Corrector sampler for the reverse SDE. 
+ + Args: + sde: forward SDE + score_estimator: neural score estimator + predictor: predictor for the reverse process + corrector: corrector for the reverse process + num_steps: number of time steps for the reverse process + num_corrector_steps: number of corrector steps + time_max: maximum time + time_min: minimum time + snr: SNR for Annealed Langevin Dynamics + output_type: type of the output ('state' for the final state, or 'mean' for the mean of the final state) + + References: + - Song et al., Score-based generative modeling through stochastic differential equations, 2021 + """ + + def __init__( + self, + sde, + score_estimator, + predictor: str = 'reverse_diffusion', + corrector: str = 'annealed_langevin_dynamics', + num_steps: int = 50, + num_corrector_steps: int = 1, + time_max: Optional[float] = None, + time_min: Optional[float] = None, + snr: float = 0.5, + output_type: str = 'mean', + ): + super().__init__() + # Create a copy of SDE + self.sde = sde.copy() + + # Update SDE parameters for sampling + if time_max is not None: + self.sde.time_max = time_max + logging.info('sde.time_max set to: %s', self.sde.time_max) + + if time_min is not None: + self.sde.time_min = time_min + logging.info('sde.time_min set to: %s', self.sde.time_min) + + self.sde.num_steps = num_steps + logging.info('sde.num_steps set to: %s', self.sde.num_steps) + + # Update local values + self.time_max = self.sde.time_max + self.time_min = self.sde.time_min + self.num_steps = self.sde.num_steps + + # Predictor setup + if predictor == 'reverse_diffusion': + self.predictor = ReverseDiffusionPredictor(sde=self.sde, score_estimator=score_estimator) + else: + raise RuntimeError(f'Unexpected predictor: {predictor}') + + # Corrector setup + if corrector == 'annealed_langevin_dynamics': + self.corrector = AnnealedLangevinDynamics( + sde=self.sde, score_estimator=score_estimator, snr=snr, num_steps=num_corrector_steps + ) + else: + raise RuntimeError(f'Unexpected corrector: {corrector}') + + if output_type not in ['mean', 'state']: + raise ValueError(f'Unexpected output type: {output_type}') + self.output_type = output_type + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tpredictor: %s', predictor) + logging.debug('\tcorrector: %s', corrector) + logging.debug('\tnum_steps: %s', self.num_steps) + logging.debug('\ttime_min: %s', self.time_min) + logging.debug('\ttime_max: %s', self.time_max) + logging.debug('\tnum_corrector_steps: %s', num_corrector_steps) + logging.debug('\tsnr: %s', snr) + logging.debug('\toutput_type: %s', self.output_type) + + @typecheck( + input_types={ + "prior_mean": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "score_condition": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType(), optional=True), + "state_length": NeuralType(tuple('B'), LengthsType(), optional=True), + }, + output_types={ + "sample": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "state_length": NeuralType(tuple('B'), LengthsType(), optional=True), + }, + ) + @torch.inference_mode() + def forward( + self, prior_mean: torch.Tensor, score_condition: torch.Tensor, state_length: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Takes prior (noisy) mean and generates a sample by solving the reverse SDE. 
+ + Args: + prior_mean: mean for the prior distribution, e.g., noisy observation + score_condition: conditioning for the score estimator + state_length: length of the valid time steps for each example in the batch + + Returns: + Generated `sample` and the corresponding `sample_length`. + """ + # Sample from the prior distribution + state = self.sde.prior_sampling(prior_mean=prior_mean) + + if state_length is not None: + state = mask_sequence_tensor(state, state_length) + + # Time steps for evaluation + time_steps = torch.linspace(self.time_max, self.time_min, self.num_steps, device=state.device) + + # Sampling + for t in time_steps: + # time steps for the whole batch + time = t * torch.ones(state.shape[0], device=state.device) + + # corrector step + state, _ = self.corrector( + state=state, time=time, score_condition=score_condition, state_length=state_length + ) + + # predictor step + state, state_mean = self.predictor( + state=state, + time=time, + score_condition=score_condition, + prior_mean=prior_mean, + state_length=state_length, + ) + + # Final output + if self.output_type == 'state': + sample = state + elif self.output_type == 'mean': + sample = state_mean + else: + raise RuntimeError(f'Unexpected output type: {self.output_type}') + + if state_length is not None: + sample = mask_sequence_tensor(sample, state_length) + + return sample, state_length + + +class Predictor(torch.nn.Module, ABC): + """Predictor for the reverse process. + + Args: + sde: forward SDE + score_estimator: neural score estimator + """ + + def __init__(self, sde, score_estimator): + super().__init__() + self.reverse_sde = ReverseStochasticDifferentialEquation(sde=sde, score_estimator=score_estimator) + + @abstractmethod + @torch.inference_mode() + def forward( + self, + *, + state: torch.Tensor, + time: torch.Tensor, + score_condition: Optional[torch.Tensor] = None, + state_length: Optional[torch.Tensor] = None, + **kwargs, + ): + """Predict the next state of the reverse process. + + Args: + state: current state of the process, shape (B, C, D, T) + time: current time of the process, shape (B,) + score_condition: conditioning for the score estimator + state_length: length of the valid time steps for each example in the batch + + Returns: + New state and mean. + """ + pass + + +class ReverseDiffusionPredictor(Predictor): + """Predict the next state of the reverse process using the reverse diffusion process. + + Args: + sde: forward SDE + score_estimator: neural score estimator + """ + + def __init__(self, sde, score_estimator): + super().__init__(sde=sde, score_estimator=score_estimator) + + @torch.inference_mode() + def forward(self, *, state, time, score_condition=None, state_length=None, **kwargs): + """Predict the next state of the reverse process using the reverse diffusion process. + + Args: + state: current state of the process, shape (B, C, D, T) + time: current time of the process, shape (B,) + score_condition: conditioning for the score estimator + state_length: length of the valid time steps for each example in the batch + + Returns: + New state and mean of the diffusion process. 
+ """ + drift, diffusion = self.reverse_sde.discretize( + state=state, time=time, score_condition=score_condition, state_length=state_length, **kwargs + ) + + # Generate a random sample from a standard normal distribution + z_norm = torch.randn_like(state) + + # Compute the mean of the next state + mean = state - drift + + # Compute new state by sampling + new_state = mean + diffusion * z_norm + + if state_length is not None: + new_state = mask_sequence_tensor(new_state, state_length) + mean = mask_sequence_tensor(mean, state_length) + + return new_state, mean + + +class Corrector(NeuralModule, ABC): + """Corrector for the reverse process. + + Args: + sde: forward SDE + score_estimator: neural score estimator + snr: SNR for Annealed Langevin Dynamics + num_steps: number of steps for the corrector + """ + + def __init__( + self, + sde: Type[StochasticDifferentialEquation], + score_estimator: Type[NeuralModule], + snr: float, + num_steps: int, + ): + super().__init__() + self.sde = sde + self.score_estimator = score_estimator + self.snr = snr + self.num_steps = num_steps + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tsnr: %s', snr) + logging.debug('\tnum_steps: %s', num_steps) + + @abstractmethod + @typecheck( + input_types={ + "state": NeuralType(('B', 'C', 'D', 'T'), VoidType()), + "time": NeuralType(tuple('B'), FloatType()), + "score_condition": NeuralType(('B', 'C', 'D', 'T'), VoidType(), optional=True), + "state_length": NeuralType(tuple('B'), LengthsType(), optional=True), + }, + output_types={"state": NeuralType(('B', 'C', 'D', 'T'), VoidType()),}, + ) + @torch.inference_mode() + def forward(self, state, time, score_condition=None, state_length=None): + """ + Args: + state: current state of the process, shape (B, C, D, T) + time: current time of the process, shape (B,) + score_condition: conditioning for the score estimator + state_length: length of the valid time steps for each example in the batch + + Returns: + New state and mean. + """ + pass + + +class AnnealedLangevinDynamics(Corrector): + """Annealed Langevin Dynamics for the reverse process. + + References: + - Song et al., Score-based generative modeling through stochastic differential equations, 2021 + """ + + def __init__(self, sde, **kwargs): + if not isinstance(sde, OrnsteinUhlenbeckVarianceExplodingSDE): + raise ValueError(f'Expected an instance of OrnsteinUhlenbeckVarianceExplodingSDE, got {type(sde)}') + super().__init__(sde=sde, **kwargs) + + @torch.inference_mode() + def forward(self, state, time, score_condition=None, state_length=None): + """Correct the state using Annealed Langevin Dynamics. + + Args: + state: current state of the process, shape (B, C, D, T) + time: current time of the process, shape (B,) + score_condition: conditioning for the score estimator + state_length: length of the valid time steps for each example in the batch + + Returns: + New state and mean of the diffusion process. + + References: + Alg. 
4 in http://arxiv.org/abs/2011.13456 + """ + # Compute the standard deviation of the diffusion process + std = self.sde.perturb_kernel_std(time=time) + # View as [B, 1, 1, 1] + std = std.view(-1, *([1] * (state.dim() - 1))) + + for i in range(self.num_steps): + # prepare input for the score estimator, concatenate conditioning along the channel dimension + score_input = state if score_condition is None else torch.cat([state, score_condition], dim=1) + + # calculate the score + score, _ = self.score_estimator(input=score_input, input_length=state_length, condition=time) + + # generate a sample from a standard normal distribution + z_norm = torch.randn_like(state) + + # compute the step size + # note: this is slightly different than in the paper, where std = ||z_norm||_2 / ||score||_2 + step_size = 2 * (self.snr * std).pow(2) + + # update the mean + mean = state + step_size * score + + # update the state + state = mean + z_norm * torch.sqrt(step_size * 2) + + if state_length is not None: + state = mask_sequence_tensor(state, state_length) + mean = mask_sequence_tensor(mean, state_length) + + return state, mean diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index b7863714eb2d..30e839fd2ca8 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -1,5 +1,6 @@ braceexpand editdistance +einops g2p_en ipywidgets jiwer diff --git a/tests/collections/asr/test_asr_datasets.py b/tests/collections/asr/test_asr_datasets.py index 946acb614f11..a2e39628e4cb 100644 --- a/tests/collections/asr/test_asr_datasets.py +++ b/tests/collections/asr/test_asr_datasets.py @@ -809,6 +809,39 @@ def test_list_to_multichannel(self, num_channels, num_targets): # Check the list is converted back to the original signal assert (ASRAudioProcessor.list_to_multichannel(target_list) == golden_target).all() + @pytest.mark.unit + @pytest.mark.parametrize('num_channels', [1, 2]) + def test_processor_process_audio(self, num_channels): + """Test signal normalization in process_audio. 
+ """ + num_samples = 1000 + num_examples = 30 + + signals = ['input_signal', 'target_signal', 'reference_signal'] + + for normalization_signal in [None] + signals: + # Create processor + processor = ASRAudioProcessor( + sample_rate=16000, random_offset=False, normalization_signal=normalization_signal + ) + + # Generate random signals + for n in range(num_examples): + example = {signal: torch.randn(num_channels, num_samples) for signal in signals} + processed_example = processor.process_audio(example) + + # Expected scale + if normalization_signal: + scale = 1.0 / (example[normalization_signal].abs().max() + processor.eps) + else: + scale = 1.0 + + # Make sure all signals are scaled as expected + for signal in signals: + assert torch.allclose( + processed_example[signal], example[signal] * scale + ), f'Failed example {n} signal {signal}' + @pytest.mark.unit def test_audio_collate_fn(self): """Test `_audio_collate_fn` diff --git a/tests/collections/asr/test_asr_losses.py b/tests/collections/asr/test_asr_losses.py index e09fd71e0892..e050e7cc07c3 100644 --- a/tests/collections/asr/test_asr_losses.py +++ b/tests/collections/asr/test_asr_losses.py @@ -17,7 +17,9 @@ import torch from nemo.collections.asr.losses.audio_losses import ( + MSELoss, SDRLoss, + calculate_mse_batch, calculate_sdr_batch, convolution_invariant_target, scale_invariant_target, @@ -271,7 +273,7 @@ def test_sdr_binary_mask(self, num_channels): estimate = target + noise # Limit calculation to masked samples - mask = _rng.integers(low=0, high=2, size=(batch_size, max_num_samples)) + mask = _rng.integers(low=0, high=2, size=(batch_size, num_channels, max_num_samples)) # Tensors for testing the loss tensor_estimate = torch.tensor(estimate) @@ -282,7 +284,9 @@ def test_sdr_binary_mask(self, num_channels): golden_sdr = 0 for b in range(batch_size): sdr = [ - calculate_sdr_numpy(estimate=estimate[b, m, mask[b, :] > 0], target=target[b, m, mask[b, :] > 0]) + calculate_sdr_numpy( + estimate=estimate[b, m, mask[b, m, :] > 0], target=target[b, m, mask[b, m, :] > 0] + ) for m in range(num_channels) ] sdr = np.mean(np.array(sdr)) @@ -467,3 +471,187 @@ def test_sdr_convolution_invariant(self, num_channels: int, filter_length: int): assert np.allclose( uut_sdr_loss.cpu().detach().numpy(), -golden_sdr, atol=atol ), f'SDRLoss not matching for example {n}' + + @pytest.mark.unit + @pytest.mark.parametrize('num_channels', [1, 4]) + @pytest.mark.parametrize('ndim', [3, 4]) + def test_mse(self, num_channels: int, ndim: int): + """Test SDR calculation + """ + batch_size = 8 + num_samples = 50 + num_features = 123 + num_batches = 10 + random_seed = 42 + atol = 1e-6 + + signal_shape = ( + (batch_size, num_channels, num_features, num_samples) + if ndim == 4 + else (batch_size, num_channels, num_samples) + ) + + reduction_dim = (-2, -1) if ndim == 4 else -1 + + mse_loss = MSELoss(ndim=ndim) + + _rng = np.random.default_rng(seed=random_seed) + + for n in range(num_batches): + + # Generate random signal + target = _rng.normal(size=signal_shape) + # Random noise + scaling + noise = _rng.uniform(low=0.01, high=1) * _rng.normal(size=signal_shape) + # Estimate + estimate = target + noise + + # DC bias for both + target += _rng.uniform(low=-1, high=1) + estimate += _rng.uniform(low=-1, high=1) + + # Tensors for testing the loss + tensor_estimate = torch.tensor(estimate) + tensor_target = torch.tensor(target) + + # Reference MSE + golden_mse = np.zeros((batch_size, num_channels)) + for b in range(batch_size): + for m in range(num_channels): + err = 
estimate[b, m, :] - target[b, m, :]
+                    golden_mse[b, m] = np.mean(np.abs(err) ** 2, axis=reduction_dim)
+
+            # Calculate MSE in torch
+            uut_mse = calculate_mse_batch(estimate=tensor_estimate, target=tensor_target)
+
+            # Calculate MSE loss
+            uut_mse_loss = mse_loss(estimate=tensor_estimate, target=tensor_target)
+
+            # Compare torch MSE vs numpy reference
+            assert np.allclose(
+                uut_mse.cpu().detach().numpy(), golden_mse, atol=atol
+            ), f'MSE not matching for example {n}'
+
+            # Compare MSE loss vs average of torch MSE
+            assert np.isclose(uut_mse_loss, uut_mse.mean(), atol=atol), f'MSELoss not matching for example {n}'
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize('num_channels', [1, 4])
+    @pytest.mark.parametrize('ndim', [3, 4])
+    def test_mse_weighted(self, num_channels: int, ndim: int):
+        """Test MSE calculation with weighting for channels
+        """
+        batch_size = 8
+        num_samples = 50
+        num_features = 123
+        num_batches = 10
+        random_seed = 42
+        atol = 1e-6
+
+        signal_shape = (
+            (batch_size, num_channels, num_features, num_samples)
+            if ndim == 4
+            else (batch_size, num_channels, num_samples)
+        )
+
+        reduction_dim = (-2, -1) if ndim == 4 else -1
+
+        _rng = np.random.default_rng(seed=random_seed)
+
+        channel_weight = _rng.uniform(low=0.01, high=1.0, size=num_channels)
+        channel_weight = channel_weight / np.sum(channel_weight)
+        mse_loss = MSELoss(weight=channel_weight, ndim=ndim)
+
+        for n in range(num_batches):
+
+            # Generate random signal
+            target = _rng.normal(size=signal_shape)
+            # Random noise + scaling
+            noise = _rng.uniform(low=0.001, high=10) * _rng.normal(size=target.shape)
+            # Estimate
+            estimate = target + noise
+
+            # Tensors for testing the loss
+            tensor_estimate = torch.tensor(estimate)
+            tensor_target = torch.tensor(target)
+
+            # Reference MSE
+            golden_mse = 0
+            for b in range(batch_size):
+                mse = [
+                    np.mean(np.abs(estimate[b, m, :] - target[b, m, :]) ** 2, axis=reduction_dim)
+                    for m in range(num_channels)
+                ]
+                # weighted sum
+                mse = np.sum(np.array(mse) * channel_weight)
+                golden_mse += mse
+            golden_mse /= batch_size  # average over batch
+
+            # Calculate MSE loss
+            uut_mse_loss = mse_loss(estimate=tensor_estimate, target=tensor_target)
+
+            # Compare
+            assert np.allclose(
+                uut_mse_loss.cpu().detach().numpy(), golden_mse, atol=atol
+            ), f'MSELoss not matching for example {n}'
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize('num_channels', [1, 4])
+    @pytest.mark.parametrize('ndim', [3, 4])
+    def test_mse_input_length(self, num_channels: int, ndim: int):
+        """Test MSE calculation with input length.
+ """ + batch_size = 8 + max_num_samples = 50 + num_features = 123 + num_batches = 10 + random_seed = 42 + atol = 1e-6 + + signal_shape = ( + (batch_size, num_channels, num_features, max_num_samples) + if ndim == 4 + else (batch_size, num_channels, max_num_samples) + ) + + reduction_dim = (-2, -1) if ndim == 4 else -1 + + _rng = np.random.default_rng(seed=random_seed) + + mse_loss = MSELoss(ndim=ndim) + + for n in range(num_batches): + + # Generate random signal + target = _rng.normal(size=signal_shape) + # Random noise + scaling + noise = _rng.uniform(low=0.001, high=10) * _rng.normal(size=target.shape) + # Estimate + estimate = target + noise + + # Limit calculation to random input_length samples + input_length = _rng.integers(low=1, high=max_num_samples, size=batch_size) + + # Tensors for testing the loss + tensor_estimate = torch.tensor(estimate) + tensor_target = torch.tensor(target) + tensor_input_length = torch.tensor(input_length) + + # Reference MSE + golden_mse = 0 + for b, b_len in enumerate(input_length): + mse = [ + np.mean(np.abs(estimate[b, m, ..., :b_len] - target[b, m, ..., :b_len]) ** 2, axis=reduction_dim) + for m in range(num_channels) + ] + mse = np.mean(np.array(mse)) + golden_mse += mse + golden_mse /= batch_size # average over batch + + # Calculate MSE + uut_mse_loss = mse_loss(estimate=tensor_estimate, target=tensor_target, input_length=tensor_input_length) + + # Compare + assert np.allclose( + uut_mse_loss.cpu().detach().numpy(), golden_mse, atol=atol + ), f'MSELoss not matching for example {n}' diff --git a/tests/collections/asr/test_audio_preprocessing.py b/tests/collections/asr/test_audio_preprocessing.py index b0875936a7f7..600b9fed44fa 100644 --- a/tests/collections/asr/test_audio_preprocessing.py +++ b/tests/collections/asr/test_audio_preprocessing.py @@ -155,7 +155,11 @@ def test_spec_to_audio(self, fft_length: int, num_channels: int): @pytest.mark.skipif(not HAVE_TORCHAUDIO, reason="Modules in this test require torchaudio") @pytest.mark.parametrize('fft_length', [128, 1024]) @pytest.mark.parametrize('num_channels', [1, 4]) - def test_audio_to_spectrogram_reconstruction(self, fft_length: int, num_channels: int): + @pytest.mark.parametrize('magnitude_power', [0.5, 1, 2]) + @pytest.mark.parametrize('scale', [0.1, 1.0]) + def test_audio_to_spectrogram_reconstruction( + self, fft_length: int, num_channels: int, magnitude_power: float, scale: float + ): """Test analysis and synthesis transform result in a perfect reconstruction. 
""" batch_size = 4 @@ -169,8 +173,12 @@ def test_audio_to_spectrogram_reconstruction(self, fft_length: int, num_channels hop_lengths = [fft_length // 2, fft_length // 4] for hop_length in hop_lengths: - audio2spec = AudioToSpectrogram(fft_length=fft_length, hop_length=hop_length) - spec2audio = SpectrogramToAudio(fft_length=fft_length, hop_length=hop_length) + audio2spec = AudioToSpectrogram( + fft_length=fft_length, hop_length=hop_length, magnitude_power=magnitude_power, scale=scale + ) + spec2audio = SpectrogramToAudio( + fft_length=fft_length, hop_length=hop_length, magnitude_power=magnitude_power, scale=scale + ) for n in range(num_examples): x = _rng.normal(size=(batch_size, num_channels, num_samples)) From 8c1ce65961c60df8c58817cae6f1cb7b5e5d407a Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Wed, 1 May 2024 15:52:36 -0700 Subject: [PATCH 016/178] Fix docs errors and most warnings (#9006) * add various docs fixes Signed-off-by: Elena Rastorgueva * make conf.py changes clearer Signed-off-by: Elena Rastorgueva * fix Duplicate explicit target name error for links Signed-off-by: Elena Rastorgueva * more fixes, mainly citations Signed-off-by: Elena Rastorgueva * fix some code formatting Signed-off-by: Elena Rastorgueva * update hf space iframe link Signed-off-by: Elena Rastorgueva * fix new ERRORs Signed-off-by: Elena Rastorgueva * Update docs Signed-off-by: yaoyu-33 --------- Signed-off-by: Elena Rastorgueva Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 Co-authored-by: Eric Harper --- docs/source/asr/datasets.rst | 53 ++++++------ docs/source/asr/intro.rst | 4 +- docs/source/asr/models.rst | 4 +- docs/source/asr/speech_intent_slot/api.rst | 2 + docs/source/asr/ssl/api.rst | 2 + docs/source/ckpt_converters/dev_guide.rst | 4 +- docs/source/ckpt_converters/user_guide.rst | 84 +++++++++---------- docs/source/conf.py | 3 +- docs/source/core/adapters/api.rst | 7 ++ docs/source/core/adapters/components.rst | 12 ++- docs/source/core/adapters/intro.rst | 1 + docs/source/core/core.rst | 11 +-- docs/source/core/exp_manager.rst | 1 + docs/source/core/export.rst | 3 +- docs/source/core/neural_types.rst | 3 + docs/source/features/memory_optimizations.rst | 13 +-- docs/source/multimodal/api.rst | 9 +- docs/source/multimodal/mllm/checkpoint.rst | 10 +-- docs/source/multimodal/nerf/dreamfusion.rst | 6 +- .../source/multimodal/text2img/controlnet.rst | 8 +- .../source/multimodal/text2img/dreambooth.rst | 8 +- docs/source/multimodal/text2img/imagen.rst | 10 +-- docs/source/multimodal/text2img/insp2p.rst | 6 +- docs/source/multimodal/text2img/intro.rst | 1 + .../multimodal/text2img/sdxl_quantization.rst | 10 ++- docs/source/multimodal/vlm/clip.rst | 6 +- docs/source/nlp/api.rst | 19 ++--- docs/source/nlp/information_retrieval.rst | 2 +- .../machine_translation.rst | 8 +- .../nlp/nemo_megatron/gpt/gpt_training.rst | 2 +- .../nemo_megatron/positional_embeddings.rst | 28 +++---- ...ation_and_capitalization_lexical_audio.rst | 6 +- .../text_normalization_as_tagging.rst | 8 +- docs/source/starthere/best-practices.rst | 2 +- docs/source/starthere/migration-guide.rst | 20 ++--- docs/source/tools/nemo_forced_aligner.rst | 8 +- docs/source/vision/checkpoint.rst | 2 +- docs/source/vision/vit.rst | 6 +- nemo/collections/asr/models/asr_model.py | 4 +- nemo/collections/asr/models/msdd_models.py | 13 ++- nemo/collections/asr/modules/rnnt.py | 23 +++-- .../tokenizers/huggingface/auto_tokenizer.py | 11 ++- .../language_modeling/megatron/t5_dataset.py | 3 +- 
 .../megatron/t5_prompt_learning_dataset.py | 4 +-
 .../language_modeling/megatron/ul2_dataset.py | 4 +-
 .../megatron_bert_embedding_model.py | 8 +-
 .../language_modeling/megatron_bert_model.py | 8 +-
 .../language_modeling/megatron_gpt_model.py | 8 +-
 .../megatron_lm_encoder_decoder_model.py | 12 ++-
 .../common/transformer/text_generation.py | 57 ++++++------
 .../megatron_vit_classification_models.py | 8 +-
 nemo/core/classes/dataset.py | 15 ++--
 nemo/utils/exp_manager.py | 4 +-
 53 files changed, 306 insertions(+), 268 deletions(-)

diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst
index b4656eec3f3f..a6e9cbe96c63 100644
--- a/docs/source/asr/datasets.rst
+++ b/docs/source/asr/datasets.rst
@@ -261,11 +261,6 @@ Semi Sorted Batching
 Sorting samples by duration and splitting them into batches speeds up training, but can degrade the quality of the model. To avoid quality degradation and maintain some randomness in the partitioning process, we add pseudo noise to the sample length when sorting.
-    .. image:: images/ssb.png
-        :align: center
-        :alt: semi sorted batching
-        :scale: 50%
-
 It may result in a training speedup of more than 40 percent with the same quality. To enable and use semi sorted batching, add the following lines to the config.
 
 .. code:: 
@@ -772,30 +767,30 @@ To enable multimodal dataloading, we provide several configuration options:
 
 Example 3. Combine an ASR (audio-text) dataset with an MT (text-only) dataset so that mini-batches have some examples from both datasets. Provide a custom prompt field for both datasets (to be leveraged by a relevant dataset class):
 
-```yaml
-use_multimodal_sampling: true
-batch_tokens: 1024
-token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor
-quadratic_factor: 50
-num_buckets: 30
-use_bucketing: true
-input_cfg:
-  - type: nemo_tarred
-    manifest_filepath: /path/to/manifest__OP_0..512_CL_.json
-    tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar
-    weight: 0.5
-    tags:
-      lang: en
-      prompt: "Given the following recording, transcribe what the person is saying:"
-  - type: txt_pair
-    source_path: /path/to/en__OP_0..512_CL_.txt
-    target_path: /path/to/pl__OP_0..512_CL_.txt
-    source_language: en
-    target_language: pl
-    weight: 0.5
-    tags:
-      prompt: "Translate the following text to Polish:"
-```
+.. code-block:: yaml
+
+    use_multimodal_sampling: true
+    batch_tokens: 1024
+    token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor
+    quadratic_factor: 50
+    num_buckets: 30
+    use_bucketing: true
+    input_cfg:
+      - type: nemo_tarred
+        manifest_filepath: /path/to/manifest__OP_0..512_CL_.json
+        tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar
+        weight: 0.5
+        tags:
+          lang: en
+          prompt: "Given the following recording, transcribe what the person is saying:"
+      - type: txt_pair
+        source_path: /path/to/en__OP_0..512_CL_.txt
+        target_path: /path/to/pl__OP_0..512_CL_.txt
+        source_language: en
+        target_language: pl
+        weight: 0.5
+        tags:
+          prompt: "Translate the following text to Polish:"
 
 .. caution::
 
     We strongly recommend using multiple shards for text files as well so that different nodes and dataloading workers are able to randomize the order of text iteration. Otherwise, multi-GPU training has a high risk of duplication of text examples.
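As a rough illustration of the sharding recommendation above (a minimal sketch, not part of NeMo: the shard count, the flat one-sentence-per-line corpus layout, and the ``en_*.txt`` / ``pl_*.txt`` output naming are assumptions), a parallel text corpus could be split into per-shard files like this:

.. code-block:: python

    from pathlib import Path

    def shard_text_pair(source_file: str, target_file: str, output_dir: str, num_shards: int = 512):
        """Split a parallel corpus into per-shard source/target files (illustrative only)."""
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        src_lines = Path(source_file).read_text(encoding="utf-8").splitlines()
        tgt_lines = Path(target_file).read_text(encoding="utf-8").splitlines()
        assert len(src_lines) == len(tgt_lines), "source/target line counts must match"
        for shard in range(num_shards):
            # Round-robin assignment keeps the shards approximately equally sized.
            src_shard = src_lines[shard::num_shards]
            tgt_shard = tgt_lines[shard::num_shards]
            (out / f"en_{shard}.txt").write_text("\n".join(src_shard) + "\n", encoding="utf-8")
            (out / f"pl_{shard}.txt").write_text("\n".join(tgt_shard) + "\n", encoding="utf-8")

The resulting files can then be referenced in ``source_path`` / ``target_path`` with a brace-expansion pattern analogous to the ``__OP_..._CL_`` patterns used in the example config above, with the range adjusted to match the actual number of shards.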
diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 7d1270af1267..d353b4d983dd 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -156,11 +156,11 @@ Canary-1B is a multi-lingual, multi-task model, supporting automatic speech-to-t .. raw:: html - diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 97dafcb2bf6d..f002137beb0f 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -46,12 +46,14 @@ HuggingFace Spaces to try out Parakeet models in your browser: * `Parakeet-TDT-1.1B `__ space .. _Conformer_model: + Conformer --------- + .. _Conformer-CTC_model: + Conformer-CTC ~~~~~~~~~~~~~ -------------- Conformer-CTC is a CTC-based variant of the Conformer model introduced in :cite:`asr-models-gulati2020conformer`. Conformer-CTC has a similar encoder as the original Conformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. diff --git a/docs/source/asr/speech_intent_slot/api.rst b/docs/source/asr/speech_intent_slot/api.rst index 735c583f9115..d45f24f807f6 100644 --- a/docs/source/asr/speech_intent_slot/api.rst +++ b/docs/source/asr/speech_intent_slot/api.rst @@ -15,8 +15,10 @@ Mixins .. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin :show-inheritance: :members: + :no-index: .. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin :show-inheritance: :members: + :no-index: diff --git a/docs/source/asr/ssl/api.rst b/docs/source/asr/ssl/api.rst index 7103243a4b20..8e6f83986032 100644 --- a/docs/source/asr/ssl/api.rst +++ b/docs/source/asr/ssl/api.rst @@ -15,10 +15,12 @@ Mixins .. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin :show-inheritance: :members: + :no-index: .. autoclass:: nemo.core.classes.mixins.access_mixins.AccessMixin :show-inheritance: :members: + :no-index: diff --git a/docs/source/ckpt_converters/dev_guide.rst b/docs/source/ckpt_converters/dev_guide.rst index 9faa752df2e1..601e69749b64 100644 --- a/docs/source/ckpt_converters/dev_guide.rst +++ b/docs/source/ckpt_converters/dev_guide.rst @@ -48,7 +48,7 @@ Script Placement and Naming Conventions Code Template ------------- -Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `_ as an full example for development. +Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `__ as an full example for development. .. code-block:: python @@ -210,7 +210,7 @@ A Simple Guide for Model Mapping and Conversion 2. **Common issues when converting: results not matching between Community model and NeMo model**: - a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `_ for guidance. + a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `__ for guidance. b. GLU Variants weights could also be a common source of error. In Megatron Core, the regular feedforward projection weights and gated forward weights are fused together, requiring careful attention to the order of these two. Refer to the `Gemma Huggingface to NeMo converter `_ for more details. 
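To make the QKV and GLU pitfalls above concrete, a minimal sketch of the kind of reshaping a converter performs is shown below. This is not the actual NeMo converter code: the tensor names, head counts, and the exact per-group interleaving order are assumptions, and the linked Gemma converter script remains the authoritative reference.

.. code-block:: python

    import torch

    def pack_qkv(q, k, v, num_heads: int, num_kv_heads: int, head_dim: int) -> torch.Tensor:
        # Group each set of query heads with the K/V head they share (GQA/MQA),
        # then fuse everything into a single matrix. The per-group ordering
        # (query heads first, then key, then value) is an assumption here.
        hidden_size = q.shape[-1]
        heads_per_group = num_heads // num_kv_heads
        q = q.view(num_kv_heads, heads_per_group * head_dim, hidden_size)
        k = k.view(num_kv_heads, head_dim, hidden_size)
        v = v.view(num_kv_heads, head_dim, hidden_size)
        return torch.cat([q, k, v], dim=1).reshape(-1, hidden_size)

    def pack_gated_ffn(gate_proj: torch.Tensor, up_proj: torch.Tensor) -> torch.Tensor:
        # Fuse the gate and up projections of a GLU feed-forward layer.
        # The concatenation order must match what the target model expects.
        return torch.cat([gate_proj, up_proj], dim=0)

Both helpers are only meant to illustrate why a converted checkpoint can load without errors yet produce mismatched outputs when the head grouping or the gate/up order does not line up with the target layout.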
diff --git a/docs/source/ckpt_converters/user_guide.rst b/docs/source/ckpt_converters/user_guide.rst index 9de22f4b5994..451679a7e3ae 100644 --- a/docs/source/ckpt_converters/user_guide.rst +++ b/docs/source/ckpt_converters/user_guide.rst @@ -6,45 +6,45 @@ This guide provides instructions on how to use the conversion scripts to convert Support Matrix -------------- -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Conversion | From | To | Github Link | -+======================+==================+=====================+====================================================================================================================+ -| Baichuan | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Baichuan | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| BERT | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| BERT | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Falcon | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Falcon | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | JAX | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | PyTorch | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| LLaMA | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| LLaMA | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mistral 7B | Hugging Face 
| NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mistral 7B | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mixtral | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mixtral | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| MPT | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Starcoder | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Conversion | From | To | Github Link | ++======================+==================+=====================+=====================================================================================================================+ +| Baichuan | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Baichuan | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| BERT | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| BERT | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Falcon | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Falcon | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | JAX | NeMo | `Link `__ | 
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | PyTorch | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| LLaMA | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| LLaMA | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mistral 7B | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mistral 7B | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mixtral | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mixtral | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| MPT | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Starcoder | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ Convert Hugging Face LLaMA Checkpoints to NeMo @@ -54,7 +54,7 @@ To convert a Hugging Face LLaMA checkpoint into a NeMo checkpoint, use the follo .. code-block:: bash - python convert_llama_hf_to_nemo.py>`_ \ + python convert_llama_hf_to_nemo.py \ --input_name_or_path \ --output_path @@ -67,7 +67,7 @@ To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two .. code-block:: bash - python convert__nemo_to_hf.py>`_ \ + python convert__nemo_to_hf.py \ --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ --output_path /path/to/pytorch_model.bin @@ -75,7 +75,7 @@ To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two .. 
code-block:: bash - python convert__nemo_to_hf.py>`_ \ + python convert__nemo_to_hf.py \ --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ --output_path /path/to/model_folder \ --hf_input_path /path/to/input_hf_folder \ diff --git a/docs/source/conf.py b/docs/source/conf.py index e8fba7457605..c599f630d7f7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -113,10 +113,9 @@ "sphinx.ext.viewcode", "sphinx.ext.napoleon", "sphinx.ext.githubpages", - "sphinxcontrib.bibtex", "sphinx.ext.inheritance_diagram", "sphinx.ext.intersphinx", - "sphinx.ext.autosectionlabel", + # "sphinx.ext.autosectionlabel", "sphinxcontrib.bibtex", "sphinx_copybutton", "sphinxext.opengraph", diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst index b0f2a8e13610..8922c72d63eb 100644 --- a/docs/source/core/adapters/api.rst +++ b/docs/source/core/adapters/api.rst @@ -9,6 +9,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -17,6 +18,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -28,6 +30,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -35,6 +38,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -47,6 +51,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -55,6 +60,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -63,3 +69,4 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst index cc2ea0b525df..d8bed1b23a75 100644 --- a/docs/source/core/adapters/components.rst +++ b/docs/source/core/adapters/components.rst @@ -8,7 +8,7 @@ An adapter module can be any pytorch module, but it must follow certain straight 1) The model accepts an input of some input dimension, and its output must match this dimension. 2) Ideally, the module is initialized such that the output of the adapter when initialized is such that it does not modify the original input. This allows the model to produce the same output results, even when additional parameters have been added. -According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider an adapter being represented as three components - +According to Junxian et al :cite:`adapters-components-Junxian2021unified`, we can consider an adapter being represented as three components - 1) Functional form - the trainable parameters that will modify the input 2) Insertion form - Where the adapter outputs are integrated with the original input. The input to the adapters can be the last output of the layer, the input to some attention layer, or even the original input to the module itself (before even the modules forward pass). @@ -17,7 +17,7 @@ According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider Functional Form - Adapter Networks ================================== -Adapter modules represent the functional form of the adapter. We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-houlsby2019adapter`. +Adapter modules represent the functional form of the adapter. 
We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-components-houlsby2019adapter`. .. note:: @@ -28,6 +28,7 @@ Adapter modules represent the functional form of the adapter. We discuss an exam :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -35,12 +36,13 @@ Adapter modules represent the functional form of the adapter. We discuss an exam :show-inheritance: :members: :member-order: bysource + :no-index: Insertion Form - Module Adapters -------------------------------- -Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers. +Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-components-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers. On top of this, while adapters are commonly used only in the layers containing the most parameters (say the Encoder of a network), some models can support adapters in multiple locations (Encoder-Decoder architecture for Language Models, Machine Translation, or even Encoder-Decoder-Joint for ASR with Transducer Loss). As such, NeMo utilizes the concept of ``Module Adapters``. @@ -70,6 +72,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -78,6 +81,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -87,4 +91,4 @@ References .. bibliography:: ./adapter_bib.bib :style: plain - :keyprefix: adapters- + :keyprefix: adapters-components- diff --git a/docs/source/core/adapters/intro.rst b/docs/source/core/adapters/intro.rst index fd94c8d23446..8c5e9cbc8895 100644 --- a/docs/source/core/adapters/intro.rst +++ b/docs/source/core/adapters/intro.rst @@ -144,4 +144,5 @@ References .. bibliography:: ./adapter_bib.bib :style: plain + :labelprefix: adapters :keyprefix: adapters- diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 6e5efa56d5f0..1c9325cf0a96 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -16,9 +16,10 @@ NeMo models contain everything needed to train and reproduce Conversational AI m NeMo uses `Hydra `_ for configuring both NeMo models and the PyTorch Lightning Trainer. -.. note:: Every NeMo model has an example configuration file and training script that can be found `here `_. +.. note:: + Every NeMo model has an example configuration file and training script that can be found `here `__. -The end result of using NeMo, `Pytorch Lightning `_, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem. 
+The end result of using NeMo, `Pytorch Lightning `__, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem. Pretrained ---------- @@ -42,14 +43,14 @@ To see all available pretrained models for a specific NeMo model, use the ``list For detailed information on the available pretrained models, refer to the collections documentation: -- :ref:`Automatic Speech Recognition (ASR)` +- :doc:`Automatic Speech Recognition (ASR) <../asr/intro>` - :doc:`Natural Language Processing (NLP) <../nlp/models>` - :doc:`Text-to-Speech Synthesis (TTS) <../tts/intro>` Training -------- -NeMo leverages `PyTorch Lightning `_ for model training. PyTorch Lightning lets NeMo decouple the +NeMo leverages `PyTorch Lightning `__ for model training. PyTorch Lightning lets NeMo decouple the conversational AI code from the PyTorch training code. This means that NeMo users can focus on their domain (ASR, NLP, TTS) and build complex AI applications without having to rewrite boiler plate code for PyTorch training. @@ -298,7 +299,7 @@ With NeMo and Hydra, every aspect of model training can be modified from the com of experiments on compute clusters or for quickly testing parameters while developing. All NeMo `examples `_ come with instructions on how to -run the training/inference script from the command-line (see `here `_ +run the training/inference script from the command-line (see `here `__ for an example). With Hydra, arguments are set using the ``=`` operator: diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst index b44d27c38b4b..efb55b0feabb 100644 --- a/docs/source/core/exp_manager.rst +++ b/docs/source/core/exp_manager.rst @@ -379,3 +379,4 @@ ExpManagerConfig :show-inheritance: :members: :member-order: bysource + :no-index: diff --git a/docs/source/core/export.rst b/docs/source/core/export.rst index 990769452a5c..c53dd8159a60 100644 --- a/docs/source/core/export.rst +++ b/docs/source/core/export.rst @@ -194,7 +194,7 @@ To facilitate that, the hooks below are provided. To export, for example, 'encod First goes the one receiving input (input_example) """ -Some nertworks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export: +Some networks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export: .. code-block:: Python @@ -202,6 +202,7 @@ Some nertworks may be exported differently according to user-settable options (l """ Sets/updates export_config dictionary """ + Also, if an action hook on setting config is desired, this method may be overloaded by `Exportable` descendants to include one. An example can be found in ``/nemo/collections/asr/models/rnnt_models.py``. diff --git a/docs/source/core/neural_types.rst b/docs/source/core/neural_types.rst index 9003b9ca5203..ec7d94336c05 100644 --- a/docs/source/core/neural_types.rst +++ b/docs/source/core/neural_types.rst @@ -24,6 +24,7 @@ Types are implemented in ``nemo.core.neural_types.NeuralType`` class. When you i are expected to include both *axes* information and *element type* information. .. 
autoclass:: nemo.core.neural_types.NeuralType + :no-index: Type Comparison Results ----------------------- @@ -31,6 +32,7 @@ Type Comparison Results When comparing two neural types, the following comparison results are generated. .. autoclass:: nemo.core.neural_types.NeuralTypeComparisonResult + :no-index: Examples -------- @@ -113,6 +115,7 @@ Custom element types It is possible to create user-defined element types to express the semantics of elements in your tensors. To do so, the user will need to inherit and implement abstract methods of the ``nemo.core.neural_types.elements.ElementType`` class .. autoclass:: nemo.core.neural_types.elements.ElementType + :no-index: Note that element types can be parametrized. Consider this example where it distinguishes between audio sampled at 8Khz and 16Khz. diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/memory_optimizations.rst index 0e0b3ad84402..d72d54ab7c2c 100644 --- a/docs/source/features/memory_optimizations.rst +++ b/docs/source/features/memory_optimizations.rst @@ -3,7 +3,7 @@ Memory Optimizations Parallelism ----------- -Refer to :doc:`Parallelism <./parallelism>`. +Refer to :doc:`Parallelism <./parallelisms>`. Flash Attention --------------- @@ -20,10 +20,8 @@ In the NeMo Framework, Flash Attention is supported through the Transformer Engi For more details on the supported Dot Attention backend, please refer to the Transformer Engine source code available at `Transformer Engine's Attention Mechanism `_. -.. bibliography:: ./nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- +Activation Recomputation +------------------------ Overview ^^^^^^^^ @@ -41,8 +39,3 @@ Selective Activation Recomputation This method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198 - -.. bibliography:: ./nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- \ No newline at end of file diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst index d6f96e6c6ea4..3228cd76d4ad 100644 --- a/docs/source/multimodal/api.rst +++ b/docs/source/multimodal/api.rst @@ -8,6 +8,7 @@ Model Classes :show-inheritance: :no-members: :members: __init__, configure_optimizers + :no-index: .. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion @@ -16,18 +17,18 @@ Model Classes :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. autoclass:: nemo.collections.multimodal.models.dreambooth.dreambooth.MegatronDreamBooth +.. autoclass:: nemo.collections.multimodal.models.text_to_image.dreambooth.dreambooth.MegatronDreamBooth :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.MegatronControlNet +.. autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.MegatronControlNet :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. 
autoclass:: nemo.collections.multimodal.models.imagen.imagen.MegatronImagen +.. autoclass:: nemo.collections.multimodal.models.text_to_image.imagen.imagen.MegatronImagen :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets @@ -65,7 +66,7 @@ Modules :members: __init__, encode -.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel +.. autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.ControlledUnetModel :show-inheritance: :no-members: :members: forward diff --git a/docs/source/multimodal/mllm/checkpoint.rst b/docs/source/multimodal/mllm/checkpoint.rst index 46c6da631ba2..d1fe7b651e66 100644 --- a/docs/source/multimodal/mllm/checkpoint.rst +++ b/docs/source/multimodal/mllm/checkpoint.rst @@ -41,7 +41,7 @@ Converting Local Checkpoints The training script only auto-converts the final checkpoint into the ``.nemo`` format. To evaluate intermediate training checkpoints, conversion to ``.nemo`` might be needed. For this: -.. code-block:: python +.. code-block:: bash python -m torch.distributed.launch --nproc_per_node= * \ examples/multimodal/convert_ckpt_to_nemo.py \ @@ -59,12 +59,12 @@ NeVA Checkpoints Currently, the conversion mainly supports LLaVA checkpoints based on "llama-2 chat" checkpoints. As a reference, we'll consider the checkpoint `llava-llama-2-13b-chat-lightning-preview `_. -After downloading this checkpoint and saving it at `/path/to/llava-llama-2-13b-chat-lightning-preview`, undertake the following procedures: +After downloading this checkpoint and saving it at ``/path/to/llava-llama-2-13b-chat-lightning-preview``, undertake the following procedures: Modifying the Tokenizer """"""""""""""""""""""" -NeMo mandates adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in `/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer`, execute the following in the NeMo container: +NeMo mandates adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in ``/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer``, execute the following in the NeMo container: .. code-block:: bash @@ -82,7 +82,7 @@ Checkpoint Conversion For conversion: -.. code-block:: python +.. code-block:: bash python examples/multimodal/mllm/neva/convert_hf_llava_to_neva.py \ --in-file /path/to/llava-llama-2-13b-chat-lightning-preview \ @@ -99,7 +99,7 @@ NeVA Checkpoints Adjust model parallelism with: -.. code-block:: python +.. code-block:: bash python examples/nlp/language_modeling/megatron_change_num_partitions.py \ --model_file=/path/to/source.nemo \ diff --git a/docs/source/multimodal/nerf/dreamfusion.rst b/docs/source/multimodal/nerf/dreamfusion.rst index a9f2f630bcdd..d6c926392556 100644 --- a/docs/source/multimodal/nerf/dreamfusion.rst +++ b/docs/source/multimodal/nerf/dreamfusion.rst @@ -3,7 +3,7 @@ DreamFusion Model Introduction ------------------- -DreamFusion :cite:`mm-models-poole2022dreamfusion` uses a pretrained text-to-image diffusion model to perform +DreamFusion :cite:`mm-models-df-poole2022dreamfusion` uses a pretrained text-to-image diffusion model to perform text-to-3D synthesis. The model uses a loss based on probability density distillation that enables the use of a 2D diffusion model as a prior for optimization of a parametric image generator. @@ -306,5 +306,5 @@ References .. 
bibliography:: ../mm_all.bib :style: plain :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- + :labelprefix: MM-MODELS-DF + :keyprefix: mm-models-df- diff --git a/docs/source/multimodal/text2img/controlnet.rst b/docs/source/multimodal/text2img/controlnet.rst index 6eae36dd017a..b9f55031b79d 100644 --- a/docs/source/multimodal/text2img/controlnet.rst +++ b/docs/source/multimodal/text2img/controlnet.rst @@ -4,12 +4,12 @@ ControlNet Model Introduction -------------------- -ControlNet :cite:`mm-models-controlnetgithub` is a neural network structure to control diffusion models by adding extra conditions. +ControlNet :cite:`mm-models-cn-controlnetgithub` is a neural network structure to control diffusion models by adding extra conditions. It copies the weights of neural network blocks into a "locked" copy and a "trainable" copy. The "trainable" one learns your condition. The "locked" one preserves your model. In this way, the ControlNet can reuse the SD encoder as a deep, strong, robust, and powerful backbone to learn diverse controls. NeMo Multimodal provides a training pipeline and example implementation for generating images based on segmentation maps. Users have the flexibility to explore other implementations using their own control input dataset and recipe. .. image:: ./images/controlnet-structure.png - :alt: ControlNet structure on stable diffusion (See :cite:`mm-models-controlnetgithub`) + :alt: ControlNet structure on stable diffusion (See :cite:`mm-models-cn-controlnetgithub`) ControlNet Dataset @@ -102,5 +102,5 @@ Reference .. bibliography:: ../mm_all.bib :style: plain :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- + :labelprefix: MM-MODELS-CN + :keyprefix: mm-models-cn- diff --git a/docs/source/multimodal/text2img/dreambooth.rst b/docs/source/multimodal/text2img/dreambooth.rst index fa7e52a7ccbb..1c6a420d49f2 100644 --- a/docs/source/multimodal/text2img/dreambooth.rst +++ b/docs/source/multimodal/text2img/dreambooth.rst @@ -5,7 +5,7 @@ DreamBooth Model Introduction -------------------- -DreamBooth :cite:`mm-models-dreamboothpaper` is a fine-tuning technique and a solution to personalize large diffusion models like Stable Diffusion, which are powerful but lack the +DreamBooth :cite:`mm-models-db-dreamboothpaper` is a fine-tuning technique and a solution to personalize large diffusion models like Stable Diffusion, which are powerful but lack the ability to mimic subjects of a given reference set. With DreamBooth, you only need a few images of a specific subject to fine-tune a pretrained text-to-image model, so that it learns to bind a unique identifier with a special subject. This unique identifier can then be used to synthesize fully-novel photorealistic images of the subject contextualized in @@ -28,7 +28,7 @@ NeMo's Dreambooth is built upon the Stable Diffusion framework. While its archit - Training Dataset - NeMo's Dreambooth model dataset is different from other NeMo multimodal models in that it doesn't necessitate data stored in the webdataset format. You can find a sample dataset at :cite:`mm-models-dreamboothdataset`. For each object you aim to integrate into the model, just place its images (typically 3-5) in a folder and specify its path in ``model.data.instance_dir``. When training with the prior preservation loss, store images produced by the original model in a distinct folder and reference its path in ``model.data.regularization_dir``. This process is automated in NeMo's DreamBooth implementation. 
+ NeMo's Dreambooth model dataset is different from other NeMo multimodal models in that it doesn't necessitate data stored in the webdataset format. You can find a sample dataset at :cite:`mm-models-db-dreamboothdataset`. For each object you aim to integrate into the model, just place its images (typically 3-5) in a folder and specify its path in ``model.data.instance_dir``. When training with the prior preservation loss, store images produced by the original model in a distinct folder and reference its path in ``model.data.regularization_dir``. This process is automated in NeMo's DreamBooth implementation. Model Configuration -------------------- @@ -130,5 +130,5 @@ Reference .. bibliography:: ../mm_all.bib :style: plain :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- + :labelprefix: MM-MODELS-DB + :keyprefix: mm-models-db- diff --git a/docs/source/multimodal/text2img/imagen.rst b/docs/source/multimodal/text2img/imagen.rst index 9aeff2f2a061..844f68df747f 100644 --- a/docs/source/multimodal/text2img/imagen.rst +++ b/docs/source/multimodal/text2img/imagen.rst @@ -4,7 +4,7 @@ Imagen Model Introduction ------------------- -Imagen :cite:`mm-models-saharia2022photorealistic` is a multi-stage text-to-image diffusion model with an unprecedented +Imagen :cite:`mm-models-imagen-saharia2022photorealistic` is a multi-stage text-to-image diffusion model with an unprecedented degree of photorealism and a deep level of language understanding. Given a text prompt, Imagen first generates an image at a 64x64 resolution and then upsamples the generated image to 256x256 and 1024x1024 resolutions, all using diffusion models. @@ -75,9 +75,9 @@ Recommended Efficient UNet size for SR256 and SR1024 models are listed below: Noise Scheduling / Sampler ^^^^^^^^^^^^^^^^^^^^^^^^^^ -NeMo Imagen supports two types of noise scheduling: Continous DDPM :cite:`mm-models-nichol2021improved` and EDM :cite:`mm-models-karras2022elucidating`. +NeMo Imagen supports two types of noise scheduling: Continous DDPM :cite:`mm-models-imagen-nichol2021improved` and EDM :cite:`mm-models-imagen-karras2022elucidating`. -Denoising diffusion probabilistic models (DDPM) :cite:`mm-models-ho2020denoising` +Denoising diffusion probabilistic models (DDPM) :cite:`mm-models-imagen-ho2020denoising` represents the most widely adopted noise scheduling approach among all diffusion models. Continuous DDPM introduces several modifications to the standard DDPM framework, with the most significant change being the transition from a discrete noise space to a continuous space. @@ -285,5 +285,5 @@ Reference .. bibliography:: ../mm_all.bib :style: plain :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- + :labelprefix: MM-MODELS-IMAGEN + :keyprefix: mm-models-imagen- diff --git a/docs/source/multimodal/text2img/insp2p.rst b/docs/source/multimodal/text2img/insp2p.rst index 177734584bc7..282874444738 100644 --- a/docs/source/multimodal/text2img/insp2p.rst +++ b/docs/source/multimodal/text2img/insp2p.rst @@ -4,7 +4,7 @@ InstructPix2Pix Model Introduction -------------------- -InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. 
NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions. +InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p-insp2p` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions. Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2pix ::class::``nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit`` is derived directly from Stable Diffusion's ::class::``nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion``, with alterations to accommodate the dataset and provide support for dual guidance. @@ -79,7 +79,7 @@ References .. bibliography:: ../mm_all.bib :style: plain :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- + :labelprefix: MM-MODELS-INSP2P + :keyprefix: mm-models-insp2p- diff --git a/docs/source/multimodal/text2img/intro.rst b/docs/source/multimodal/text2img/intro.rst index 3c3c17768679..599c9bae5e15 100644 --- a/docs/source/multimodal/text2img/intro.rst +++ b/docs/source/multimodal/text2img/intro.rst @@ -13,4 +13,5 @@ NeMo multimodal provides implementations of multiple image-to-text models, inclu imagen dreambooth controlnet + insp2p sdxl_quantization diff --git a/docs/source/multimodal/text2img/sdxl_quantization.rst b/docs/source/multimodal/text2img/sdxl_quantization.rst index 78403e9c402c..68bb7ff8d511 100644 --- a/docs/source/multimodal/text2img/sdxl_quantization.rst +++ b/docs/source/multimodal/text2img/sdxl_quantization.rst @@ -7,16 +7,17 @@ This example shows how to use Ammo to calibrate and quantize the UNet part of th We also provide instructions on deploying and running E2E SDXL pipeline with Ammo quantized int8 UNet to generate images and measure latency on target GPUs. -To get started, it is required to have a pretrained SDXL checkpoint in `nemo` format. The example training configs are provided in NeMo, -which is located in `NeMo/examples/multimodal/text2img/stable_diffusion`. +To get started, it is required to have a pretrained SDXL checkpoint in ``nemo`` format. The example training configs are provided in NeMo, +which is located in ``NeMo/examples/multimodal/text2img/stable_diffusion``. 
Calibration --------------- The first step is to run quantization script with default config, and finally the script will export the quantized unet to onnx file. -Here is the default config for `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_quantize.py`. +Here is the default config for ``NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_quantize.py``. .. code-block:: yaml + quantize exp_name: nemo n_steps: 20 # number of inference steps @@ -41,6 +42,7 @@ Build the TRT engine for the Quantized ONNX UNet ------------------------------------------------------------ .. code-block:: bash + trtexec --onnx=./nemo_onnx/unet.onnx --shapes=x:8x4x128x128,timesteps:8,context:8x80x2048,y:8x2816 --fp16 --int8 --builderOptimizationLevel=4 --saveEngine=nemo_unet_xl.plan @@ -57,6 +59,7 @@ Build End-to-end Stable Diffusion XL Pipeline with NeMo We provide a script to build end to end TRT inference pipeline with NeMo backend, which is located at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_export.py`. .. code-block:: yaml + infer: out_path: sdxl_export width: 1024 @@ -82,6 +85,7 @@ Run End-to-end Stable Diffusion XL TRT Pipeline The inference script can be found at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_trt_inference.py`. .. code-block:: yaml + unet_xl: sdxl_export/plan/unet_xl.plan vae: sdxl_export/plan/vae.plan clip1: sdxl_export/plan/clip1.plan diff --git a/docs/source/multimodal/vlm/clip.rst b/docs/source/multimodal/vlm/clip.rst index e28fb836ff4a..976baadb5a83 100644 --- a/docs/source/multimodal/vlm/clip.rst +++ b/docs/source/multimodal/vlm/clip.rst @@ -4,7 +4,7 @@ CLIP Model Introduction ------------------- -Contrastive Language-Image Pre-training (CLIP) :cite:`mm-models-radford2021learning` offers an efficient method for learning image representations using natural language supervision. The essence of CLIP is to train both an image encoder and a text encoder from scratch. The model aims to predict the correct pairings of a batch of (image, text) training examples by jointly training these encoders. During pre-training, CLIP is designed to predict which images and texts form a semantically coherent pair by maximizing the similarity between the correct (image, text) pairs while minimizing the similarity between incorrect pairs. This contrastive learning approach ensures that CLIP learns meaningful and contextually rich representations of both visual and textual data. +Contrastive Language-Image Pre-training (CLIP) :cite:`mm-models-clip-radford2021learning` offers an efficient method for learning image representations using natural language supervision. The essence of CLIP is to train both an image encoder and a text encoder from scratch. The model aims to predict the correct pairings of a batch of (image, text) training examples by jointly training these encoders. During pre-training, CLIP is designed to predict which images and texts form a semantically coherent pair by maximizing the similarity between the correct (image, text) pairs while minimizing the similarity between incorrect pairs. This contrastive learning approach ensures that CLIP learns meaningful and contextually rich representations of both visual and textual data. NeMo's implementation of the CLIP model leverages its parallel transformer implementation, specifically the `nemo.collections.nlp.modules.common.megatron.transformer.ParallelTransformer`, to enable model parallelism support in both the text encoder and vision model. 
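As a concrete illustration of the contrastive objective just described, here is a minimal PyTorch sketch (a simplified stand-in, not NeMo's or open_clip's implementation) of the symmetric cross-entropy loss over a batch of paired image and text embeddings:

.. code-block:: python

    import torch
    import torch.nn.functional as F

    def clip_contrastive_loss(image_emb, text_emb, temperature=0.07):
        # image_emb, text_emb: [B, D] pooled outputs of the image and text encoders.
        image_emb = F.normalize(image_emb, dim=-1)
        text_emb = F.normalize(text_emb, dim=-1)
        logits = image_emb @ text_emb.t() / temperature                # [B, B] pairwise similarities
        targets = torch.arange(logits.size(0), device=logits.device)  # matching pairs sit on the diagonal
        # Symmetric cross-entropy over the image-to-text and text-to-image directions.
        return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))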
This design choice ensures efficient scaling and utilization of resources during training. Additionally, some of the model design and loss implementations in NeMo's CLIP are inspired by the open-source [open_clip](https://github.com/mlfoundations/open_clip) repository. @@ -153,5 +153,5 @@ References .. bibliography:: ../mm_all.bib :style: plain :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- + :labelprefix: MM-MODELS-CLIP + :keyprefix: mm-models-clip- diff --git a/docs/source/nlp/api.rst b/docs/source/nlp/api.rst index b9b4d529ba46..52c1b537b0bf 100755 --- a/docs/source/nlp/api.rst +++ b/docs/source/nlp/api.rst @@ -22,7 +22,7 @@ Pretraining Model Classes .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_bart_model.MegatronBARTModel :show-inheritance: :no-members: - :members: training_step, validation_step, build_train_valid_test_datasets, setup, on_save_checkpoint, on_load_checkpoint + :members: training_step, validation_step, build_train_valid_test_datasets, setup .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_retrieval_model.MegatronRetrievalModel :show-inheritance: @@ -45,32 +45,27 @@ Customization Model Classes .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTAdapterLearningModel :show-inheritance: :no-members: - :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup + :members: __init__, state_dict, generate, training_step, validation_step, setup .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTInfusedAdapterModel :show-inheritance: :no-members: - :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup + :members: __init__, state_dict, generate, training_step, validation_step, setup .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model.MegatronGPTPromptLearningModel :show-inheritance: :no-members: - :members: built_virtual_prompt_dataset, generate, training_step, validation_step, build_train_valid_test_datasets, setup + :members: build_virtual_prompt_dataset, generate, training_step, validation_step, setup .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel :show-inheritance: :no-members: - :members: __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel - :show-inheritance: - :no-members: - :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup + :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, setup .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5InfusedAdapterModel :show-inheritance: :no-members: - :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup + :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, setup Modules ------- @@ -86,7 +81,7 @@ Modules :no-members: :members: forward -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.bert_model.BertModel +.. 
autoclass:: nemo.collections.nlp.models.language_modeling.megatron.bert.bert_model.NeMoBertModel :show-inheritance: :no-members: :members: forward diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst index fa9157e45b59..26732283e8f4 100644 --- a/docs/source/nlp/information_retrieval.rst +++ b/docs/source/nlp/information_retrieval.rst @@ -53,7 +53,7 @@ BERT checkpoint to NeMo (mcore) using the following: Then you can fine-tune the sentence-BERT model using the following script: -.. code-block:: python +.. code-block:: bash #!/bin/bash diff --git a/docs/source/nlp/machine_translation/machine_translation.rst b/docs/source/nlp/machine_translation/machine_translation.rst index 190ac5b07da9..f58c67551abe 100644 --- a/docs/source/nlp/machine_translation/machine_translation.rst +++ b/docs/source/nlp/machine_translation/machine_translation.rst @@ -470,12 +470,12 @@ NMT with bottleneck encoder architecture is also supported (i.e., fixed size bot 1. Supported learning frameworks (**model.model_type**): * NLL - Conditional cross entropy (the usual NMT loss) - * VAE - Variational Auto-Encoder (`paper `_) - * MIM - Mutual Information Machine (`paper `_) + * VAE - Variational Auto-Encoder (`paper `__) + * MIM - Mutual Information Machine (`paper `__) 2. Supported encoder architectures (**model.encoder.arch**): * seq2seq - the usual transformer encoder without a bottleneck - * bridge - attention bridge bottleneck (`paper `_) - * perceiver - Perceiver bottleneck (`paper `_) + * bridge - attention bridge bottleneck (`paper `__) + * perceiver - Perceiver bottleneck (`paper `__) +----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ diff --git a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst index 2e94cc45b40f..efc2ac3f8439 100644 --- a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst +++ b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst @@ -70,7 +70,7 @@ Note that training tokenizer model will also take some time. --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 \ --split_digits true -After this is done (will take a while), you'll have two files: ```spm_32k_wiki.model``` and ```spm_32k_wiki.vocab``corresponding to the model and vocabulary. +After this is done (will take a while), you'll have two files: ``spm_32k_wiki.model`` and ``spm_32k_wiki.vocab`` corresponding to the model and vocabulary. **Step 4: Convert training data into memory map format** diff --git a/docs/source/nlp/nemo_megatron/positional_embeddings.rst b/docs/source/nlp/nemo_megatron/positional_embeddings.rst index 332ce304049d..cac0bb452f58 100644 --- a/docs/source/nlp/nemo_megatron/positional_embeddings.rst +++ b/docs/source/nlp/nemo_megatron/positional_embeddings.rst @@ -18,38 +18,38 @@ GPT - .. code:: model.position_embedding_type='learned_absolute' - - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. 
+ - Absolute Position Encodings :cite:`pos-emb-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. * - **rope** - .. code:: model.position_embedding_type='rope' model.rotary_percentage=1.0 - - Rotary Position Embedding (RoPE) :cite:`nlp-megatron-su2022roformer` incorporates positional information by utilizing a rotation matrix to encode the absolute positions of tokens while maintaining relative positional relationships in self-attention formulations. It achieves this by leveraging the geometric properties of vectors and complex numbers and applying a rotation based on a preset non-zero constant and the relative positions of the tokens to the word embeddings. - + - Rotary Position Embedding (RoPE) :cite:`pos-emb-su2022roformer` incorporates positional information by utilizing a rotation matrix to encode the absolute positions of tokens while maintaining relative positional relationships in self-attention formulations by leveraging the geometric properties of vectors and complex numbers, applying a rotation based on a preset non-zero constant and the relative positions of the tokens to the word embeddings. + * - **alibi** - .. code:: model.position_embedding_type='alibi' - - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. + - Attention with Linear Biases (ALiBi) :cite:`pos-emb-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. * - **kerple** - .. code:: model.position_embedding_type='kerple' - - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using Conditionally Positive Definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. 
This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. + - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`pos-emb-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using conditionally positive definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. * - **xpos** - .. code:: model.position_embedding_type='xpos' - - Extrapolatable Position Embedding (xPos) :cite:`nlp-megatron-sun2022lengthextrapolatable` + - Extrapolatable Position Embedding (xPos) :cite:`pos-emb-sun2022lengthextrapolatable` * - **sandwich** - .. code:: model.position_embedding_type='sandwich' - - Sandwich :cite:`nlp-megatron-chi2023dissecting` + - Sandwich :cite:`pos-emb-chi2023dissecting` T5 ^^ @@ -67,32 +67,32 @@ T5 model.encoder.position_embedding_type='learned_absolute' model.decoder.position_embedding_type='learned_absolute' - - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. + - Absolute Position Encodings :cite:`pos-emb-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. * - **relative** - .. code:: model.encoder.position_embedding_type='relative' model.decoder.position_embedding_type='relative' - - Relative Position Representations :cite:`nlp-megatron-shaw2018selfattention` + - Relative Position Representations :cite:`pos-emb-shaw2018selfattention` * - **alibi** - .. code:: model.encoder.position_embedding_type='alibi' model.decoder.position_embedding_type='alibi' - - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. + - Attention with Linear Biases (ALiBi) :cite:`pos-emb-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. 
This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. * - **kerple** - .. code:: model.encoder.position_embedding_type='kerple' model.decoder.position_embedding_type='kerple' - - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using Conditionally Positive Definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. + - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`pos-emb-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using conditionally positive definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. Positional interpolation ------------------------ -Position Interpolation (PI) :cite:`nlp-megatron-chen2023extending` is a method introduced to extend the context window sizes of Rotary Position Embedding (RoPE)-based pretrained large language models (LLMs). The central principle of PI is to reduce the position indices so that they align with the initial context window size through interpolation. +Position Interpolation (PI) :cite:`pos-emb-chen2023extending` is a method introduced to extend the context window sizes of Rotary Position Embedding (RoPE)-based pretrained large language models (LLMs). The central principle of PI is to reduce the position indices so that they align with the initial context window size through interpolation. Positional Interpolation is supported in Megatron GPT SFT models. Set RoPE Interpolation factor for sequence length :code:`seq_len_interpolation_factor` to enable it. @@ -107,5 +107,5 @@ References .. bibliography:: ../nlp_all.bib :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- \ No newline at end of file + :labelprefix: pos-emb + :keyprefix: pos-emb- \ No newline at end of file diff --git a/docs/source/nlp/punctuation_and_capitalization_lexical_audio.rst b/docs/source/nlp/punctuation_and_capitalization_lexical_audio.rst index 8314676e5c4c..4cd13abd2264 100644 --- a/docs/source/nlp/punctuation_and_capitalization_lexical_audio.rst +++ b/docs/source/nlp/punctuation_and_capitalization_lexical_audio.rst @@ -36,7 +36,7 @@ Quick Start Guide Model Description ----------------- In addition to :doc:`Punctuation And Capitalization model <./punctuation_and_capitalization>` we add audio encoder (e.g. Conformer's encoder) and attention based fusion of lexical and audio features. 
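The sketch below illustrates one way such attention-based fusion can look in PyTorch; it is a hypothetical, simplified block (module and argument names are assumptions, not NeMo's implementation) in which lexical token states attend over encoded audio frames and are combined through a residual connection.

.. code-block:: python

    import torch.nn as nn

    class LexicalAudioFusion(nn.Module):
        """Hypothetical fusion block: lexical tokens attend over encoded audio frames."""

        def __init__(self, hidden_dim: int = 768, audio_dim: int = 512, num_heads: int = 8):
            super().__init__()
            self.audio_proj = nn.Linear(audio_dim, hidden_dim)  # map audio features to the lexical width
            self.cross_attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
            self.norm = nn.LayerNorm(hidden_dim)

        def forward(self, lexical, audio):
            # lexical: [B, T_text, hidden_dim], audio: [B, T_audio, audio_dim]
            audio = self.audio_proj(audio)
            fused, _ = self.cross_attn(query=lexical, key=audio, value=audio)
            # Residual connection keeps the lexical path intact when audio adds little information.
            return self.norm(lexical + fused)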
-This model architecture is based on `Multimodal Semi-supervised Learning Framework for Punctuation Prediction in Conversational Speech `__ :cite:`nlp-punct-sunkara20_interspeech`. +This model architecture is based on `Multimodal Semi-supervised Learning Framework for Punctuation Prediction in Conversational Speech `__ :cite:`nlp-punct-lex-sunkara20_interspeech`. .. note:: @@ -386,6 +386,6 @@ References .. bibliography:: nlp_all.bib :style: plain - :labelprefix: NLP-PUNCT - :keyprefix: nlp-punct- + :labelprefix: NLP-PUNCT-LEX + :keyprefix: nlp-punct-lex- diff --git a/docs/source/nlp/text_normalization/text_normalization_as_tagging.rst b/docs/source/nlp/text_normalization/text_normalization_as_tagging.rst index 672226622357..702fb9425026 100644 --- a/docs/source/nlp/text_normalization/text_normalization_as_tagging.rst +++ b/docs/source/nlp/text_normalization/text_normalization_as_tagging.rst @@ -59,7 +59,7 @@ In the example, ```` denotes that the spoken form is the same as the writt -More information about the Google Text Normalization Dataset can be found in the paper `RNN Approaches to Text Normalization: A Challenge `__ :cite:`nlp-textnorm-sproat2016rnn`. +More information about the Google Text Normalization Dataset can be found in the paper `RNN Approaches to Text Normalization: A Challenge `__ :cite:`nlp-textnorm-tag-sproat2016rnn`. Data preprocessing @@ -146,7 +146,7 @@ contextualized representation for each input token. It then uses a classificatio to predict the tag for each token. Another classification head is used to predict a "semiotic" class label for each token. Overall, our design is partly inspired by the LaserTagger approach proposed in the paper -`Encode, tag, realize: High-precision text editing `__ :cite:`nlp-textnorm-malmi2019encode`. +`Encode, tag, realize: High-precision text editing `__ :cite:`nlp-textnorm-tag-malmi2019encode`. The LaserTagger method is not directly applicable to ITN because it can only regard the whole non-common fragment as a single replacement tag, whereas spoken-to-written conversion, e.g. a date, needs to be aligned on a more granular level. Otherwise, @@ -161,5 +161,5 @@ References .. bibliography:: tn_itn_all.bib :style: plain - :labelprefix: NLP-TEXTNORM - :keyprefix: nlp-textnorm- + :labelprefix: NLP-TEXTNORM-TAG + :keyprefix: nlp-textnorm-tag diff --git a/docs/source/starthere/best-practices.rst b/docs/source/starthere/best-practices.rst index ec0fea1985cc..759ee108ed7b 100644 --- a/docs/source/starthere/best-practices.rst +++ b/docs/source/starthere/best-practices.rst @@ -23,7 +23,7 @@ NeMo excels in training large-scale LLM & MM, utilizing optimizations from Megat - Advanced checkpointing through the Distributed Checkpoint Format. Speech AI --------- +--------- Data Augmentation ~~~~~~~~~~~~~~~~~ diff --git a/docs/source/starthere/migration-guide.rst b/docs/source/starthere/migration-guide.rst index 1d9816493a5b..7005873e5343 100644 --- a/docs/source/starthere/migration-guide.rst +++ b/docs/source/starthere/migration-guide.rst @@ -8,39 +8,39 @@ Upgrade guide to use lightning 2.0 .. _dummy_header: -* Replace ``trainer.strategy=null`` with ``trainer.strategy=auto`` as `lightning 2.0 doesn't have None strategy `_. +* Replace ``trainer.strategy=null`` with ``trainer.strategy=auto`` as `lightning 2.0 doesn't have None strategy `__. -* Remove ``resume_from_checkpoint`` if being used as a trainer flag and pass the path to `Trainer.fit(ckpt_path="...") method `_. 
+* Remove ``resume_from_checkpoint`` if being used as a trainer flag and pass the path to `Trainer.fit(ckpt_path="...") method `__. * Set ``trainer.strategy = "ddp_find_unused_parameters_true"`` if there are unused parameters in your model as lightning 2.0 has find_unused_parameters as False by default. - Reference: `NeMo PR 6433 `_. More details about this change: `lightning PR 16611 `_. + Reference: `NeMo PR 6433 `__. More details about this change: `lightning PR 16611 `__. -* If used Trainer's flag ``replace_sampler_ddp`` replace it with `use_distributed_sampler `_. +* If used Trainer's flag ``replace_sampler_ddp`` replace it with `use_distributed_sampler `__. -* If using ``CheckpointConnector`` replace it with `_CheckpointConnector `_. +* If using ``CheckpointConnector`` replace it with `_CheckpointConnector `__. * To set or get ``ckpt_path`` use ``trainer.ckpt_path`` directly instead of calling protected API via ``trainer._checkpoint_connector._ckpt_path`` or using ``trainer._checkpoint_connector.resume_from_checkpoint_fit_path``. * Change ``import load`` from pytorch_lightning.utilities.cloud_io to ``import _load``. -* If used ``from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin`` from replace it with `from pytorch_lightning.plugins.precision import MixedPrecisionPlugin `_. +* If used ``from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin`` from replace it with `from pytorch_lightning.plugins.precision import MixedPrecisionPlugin `__. * Lightning 2.0 adds ``'16-mixed'``, ``'bf16-mixed'`` as the preicison values for fp16 mixed precision and bf16 mixed precision respectively. - For backward compatbility ``16`` or ``'16'`` and ``'bf16'`` also perform mixed precision and is equivalent to ``'16-mixed'`` and ``'bf16-mixed'`` respectively. However, lightning recommends to use ``'16-mixed'`` and ``'bf16-mixed'`` to make it less ambiguous. Due to this, ``MegatronHalfPrecisionPlugin's`` parent class from lightning ``MixedPrecisionPlugin`` class, expects the precision arg to be ``'16-mixed'`` and ``'bf16-mixed'``. As a result it's required to pass ``'16-mixed'`` or ``'bf16-mixed'`` to ``MixedPrecisionPLugin`` whenever the precision passed is any of ``[16, '16', '16-mixed']`` or ``['bf16', 'bf16-mixed']``. This can be taken care as shown here: `NeMo upgrade to lightning 2.0 PR `_ and here: `MixedPrecisionPlugin `_. Also, ``'32-true'`` is added as a precsion value for pure fp32 along with ``32``, ``'32'`` that existed. This can be taken into account as shown here in the `NeMo upgrade to lightning 2.0 PR `_. + For backward compatbility ``16`` or ``'16'`` and ``'bf16'`` also perform mixed precision and is equivalent to ``'16-mixed'`` and ``'bf16-mixed'`` respectively. However, lightning recommends to use ``'16-mixed'`` and ``'bf16-mixed'`` to make it less ambiguous. Due to this, ``MegatronHalfPrecisionPlugin's`` parent class from lightning ``MixedPrecisionPlugin`` class, expects the precision arg to be ``'16-mixed'`` and ``'bf16-mixed'``. As a result it's required to pass ``'16-mixed'`` or ``'bf16-mixed'`` to ``MixedPrecisionPLugin`` whenever the precision passed is any of ``[16, '16', '16-mixed']`` or ``['bf16', 'bf16-mixed']``. This can be taken care as shown here: `NeMo upgrade to lightning 2.0 PR `__ and here: `MixedPrecisionPlugin `__. Also, ``'32-true'`` is added as a precsion value for pure fp32 along with ``32``, ``'32'`` that existed. 
This can be taken into account as shown here in the `NeMo upgrade to lightning 2.0 PR `__. -* Lightning 2.0 renames epoch end hooks from ``training_epoch_end``, ``validation_epoch_end``, ``test_epoch_end`` to ``on_train_epoch_end``, ``on_validation_epoch_end``, ``on_test_epoch_end``. The renamed hooks do not accept the outputs arg but instead outputs needs to be defined as an instance variable of the model class to which the outputs of the step needs to be manually appended. More detailed examples implementing this can be found under migration guide of `lightning's PR 16520 `_. Example from NeMo can be found `here `_. +* Lightning 2.0 renames epoch end hooks from ``training_epoch_end``, ``validation_epoch_end``, ``test_epoch_end`` to ``on_train_epoch_end``, ``on_validation_epoch_end``, ``on_test_epoch_end``. The renamed hooks do not accept the outputs arg but instead outputs needs to be defined as an instance variable of the model class to which the outputs of the step needs to be manually appended. More detailed examples implementing this can be found under migration guide of `lightning's PR 16520 `__. Example from NeMo can be found `here `__. * Lightning 2.0 is not currently supporting multiple dataloders for validation and testing in case of ``dataloader_iter``. The support for this will be added back soon in an upcoming release. If ``dataloader_iter`` is being used and your config passes multiple files to ``validation_ds.file_names`` or ``test_ds.file_names``, please use just one file until this issue is fixed with pytorch lightning. * With lightning 2.0 it's required to set ``limit_val_batches`` and ``num_sanity_val_steps`` to be a multiple of number of microbatches while using ``dataloader_iter`` (applies only to Megatron files that use dataloader_iter) for all pretraining files (not downstream tasks like finetuning). This is being taken care internally in NeMo and does not require anything to be done by the user. However, if you are a developer of NeMo and are building a new model for pretraining that uses ``dataloader_iter`` instead of batch in ``validation_step`` methods please make sure to call ``self._reconfigure_val_batches()`` in ``build_train_valid_test_datasets method`` of your model. * If model is being wrapped with ``LightningDistributedModule`` in ``configure_ddp`` method please replace it with ``_LightningModuleWrapperBase`` - as being done here: `NeMo upgrade to lightning 2.0 PR `_. + as being done here: `NeMo upgrade to lightning 2.0 PR `__. -* If using ``pre_configure_ddp()`` in your DDP, remove it as it's not required anymore. `NeMo upgrade to lightning 2.0 PR `_. +* If using ``pre_configure_ddp()`` in your DDP, remove it as it's not required anymore. `NeMo upgrade to lightning 2.0 PR `__. * If any of the tests use CPU as the device, ensure to explicitly pass it in the trainer as ``trainer = pl.Trainer(max_epochs=1, accelerator='cpu')`` since deafult val in PTL >= 2.0 is auto and it picks cuda. diff --git a/docs/source/tools/nemo_forced_aligner.rst b/docs/source/tools/nemo_forced_aligner.rst index aa8d2139653f..df872e7d2195 100644 --- a/docs/source/tools/nemo_forced_aligner.rst +++ b/docs/source/tools/nemo_forced_aligner.rst @@ -12,14 +12,14 @@ NFA can be used on long audio files of 1+ hours duration (subject to your hardwa Demos & Tutorials ----------------- -* HuggingFace Space `demo `_ to quickly try out NFA in various languages. -* NFA "how-to" notebook `tutorial `_. -* "How forced alignment works" NeMo blog `tutorial `_. 
+* HuggingFace Space `demo `__ to quickly try out NFA in various languages. +* NFA "how-to" notebook `tutorial `__. +* "How forced alignment works" NeMo blog `tutorial `__. Quickstart ---------- -1. Install `NeMo `_. +1. Install `NeMo `__. 2. Prepare a NeMo-style manifest containing the paths of audio files you would like to proces, and (optionally) their text. 3. Run NFA's ``align.py`` script with the desired config, e.g.: diff --git a/docs/source/vision/checkpoint.rst b/docs/source/vision/checkpoint.rst index 7e3e197a1169..49848b90d51a 100644 --- a/docs/source/vision/checkpoint.rst +++ b/docs/source/vision/checkpoint.rst @@ -63,7 +63,7 @@ ViT Checkpoints To adjust model parallelism from original model parallelism size to a new model parallelism size (Note: NeMo ViT currently only supports `pipeline_model_parallel_size=1`): -.. code-block:: python +.. code-block:: bash python examples/nlp/language_modeling/megatron_change_num_partitions.py \ --model_file=/path/to/source.nemo \ diff --git a/docs/source/vision/vit.rst b/docs/source/vision/vit.rst index 679313bcbd66..a7b4e2546f22 100644 --- a/docs/source/vision/vit.rst +++ b/docs/source/vision/vit.rst @@ -4,7 +4,7 @@ ViT Model Introduction ------------------- -The Vision Transformer, commonly referred to as ViT :cite:`vision-models-vit`, serves as a foundational model +The Vision Transformer, commonly referred to as ViT :cite:`vision-models-vit-vit`, serves as a foundational model for image classification tasks in NeMo. Unlike conventional convolutional neural networks, ViT adopts a transformer-like architecture to process image data. In this approach, an image is divided into fixed-size patches, typically 14x14 or 16x16. These patches are linearly embedded and augmented with position embeddings. The resulting @@ -136,5 +136,5 @@ Reference .. bibliography:: ./vision_all.bib :style: plain :filter: docname in docnames - :labelprefix: VISION-MODELS - :keyprefix: vision-models- + :labelprefix: VISION-MODELS-VIT + :keyprefix: vision-models-vit- diff --git a/nemo/collections/asr/models/asr_model.py b/nemo/collections/asr/models/asr_model.py index 4420318dd416..e14424cec5c1 100644 --- a/nemo/collections/asr/models/asr_model.py +++ b/nemo/collections/asr/models/asr_model.py @@ -203,9 +203,9 @@ def forward_for_export( """ This forward is used when we need to export the model to ONNX format. Inputs cache_last_channel and cache_last_time are needed to be passed for exporting streaming models. + Args: - input: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps. + input: Tensor that represents a batch of raw audio signals of shape [B, T]. T here represents timesteps. length: Vector of length B, that contains the individual lengths of the audio sequences. cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers diff --git a/nemo/collections/asr/models/msdd_models.py b/nemo/collections/asr/models/msdd_models.py index d96bafd5af9b..01926eb4ae79 100644 --- a/nemo/collections/asr/models/msdd_models.py +++ b/nemo/collections/asr/models/msdd_models.py @@ -400,10 +400,15 @@ def get_cluster_avg_embs_model( multi-scale input tensors during forward propagating. Example: `batch_size=3, scale_n=6, emb_dim=192` - ms_seg_counts = - [[8, 9, 12, 16, 25, 51], - [11, 13, 14, 17, 25, 51], - [ 9, 9, 11, 16, 23, 50]] + .. 
code:: python + + ms_seg_counts = + [ + [ 8, 9, 12, 16, 25, 51], + [11, 13, 14, 17, 25, 51], + [ 9, 9, 11, 16, 23, 50] + ] + Counts of merged segments: (121, 131, 118) embs has shape of (370, 192) clus_label_index has shape of (3, 131) diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index 5a7457f6379d..055066c00660 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -1559,13 +1559,13 @@ def joint_after_projection(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tens NOTE: The implementation of this model is slightly modified from the original paper. The original paper proposes the following steps : - (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1 - *1 -> Forward through joint final [B, T, U, V + 1]. + (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- \*1 + \*1 -> Forward through joint final [B, T, U, V + 1]. We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows: - enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1 - dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2 - (*1, *2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1]. + enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- \*1 + dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- \*2 + (\*1, \*2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1]. Args: f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1] @@ -2050,8 +2050,7 @@ def sampled_joint( """ Compute the sampled joint step of the network. - # Reference - - [Memory-Efficient Training of RNN-Transducer with Sampled Softmax](https://arxiv.org/abs/2203.16868) + Reference: `Memory-Efficient Training of RNN-Transducer with Sampled Softmax `__. Here, B = Batch size @@ -2065,13 +2064,13 @@ def sampled_joint( NOTE: The implementation of this joint model is slightly modified from the original paper. The original paper proposes the following steps : - (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1 - *1 -> Forward through joint final [B, T, U, V + 1]. + (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- \*1 + \*1 -> Forward through joint final [B, T, U, V + 1]. 
We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows: - enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1 - dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2 - (*1, *2) -> Sum [B, T, U, H] -> Sample Vocab V_Pos (for target tokens) and V_Neg -> + enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- \*1 + dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- \*2 + (\*1, \*2) -> Sum [B, T, U, H] -> Sample Vocab V_Pos (for target tokens) and V_Neg -> (V_Neg is sampled not uniformly by as a rand permutation of all vocab tokens, then eliminate all Intersection(V_Pos, V_Neg) common tokens to avoid duplication of loss) -> Concat new Vocab V_Sampled = Union(V_Pos, V_Neg) diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index b264890ce48d..dc0cef692ee2 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -26,9 +26,10 @@ class AutoTokenizer(TokenizerSpec): - ''' + """ Wrapper of HuggingFace AutoTokenizer https://huggingface.co/transformers/model_doc/auto.html#autotokenizer. - ''' + + """ def __init__( self, @@ -52,7 +53,7 @@ def __init__( For more details please refer to https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained. The list of all supported models can be found here: ALL_PRETRAINED_CONFIG_ARCHIVE_MAP vocab_file: path to file with vocabulary which consists - of characters separated by '\n'. + of characters separated by newlines. mask_token: mask token bos_token: the beginning of sequence token eos_token: the end of sequence token. Usually equal to sep_token @@ -167,11 +168,13 @@ def add_special_tokens(self, special_tokens_dict: dict) -> int: """ Adds a dictionary of special tokens (eos, pad, cls...). If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). + Args: special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``]. - Tokens are only added if they are not already in the vocabulary. + Tokens are only added if they are not already in the vocabulary. + Returns: Number of tokens added to the vocabulary. """ diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_dataset.py index 72f4fd0e12a1..f0efaf5cd1aa 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/t5_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_dataset.py @@ -252,7 +252,8 @@ def build_training_sample( skip_masking_id=None, ): """Build training sample. - Arguments: + + Args: sample: A list of sentences in which each sentence is a list token ids. target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. 
All values are padded to diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py index 5ed0da009cf2..fb8ec9554a95 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py @@ -72,10 +72,10 @@ def load_data(self, dataset): """ Loads a dataset by filling in the task templates specified in the config file with the information from each training/inference example. Converts all input - text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in + text into token ids. Also replaces the ``<|VIRTUAL_PROMPT_#|>`` placeholders in the task templates with the actual virtual prompt token ids. - params: + Args: dataset: A list of json objects or a dictionary objects each containing the information needed for a training example """ diff --git a/nemo/collections/nlp/data/language_modeling/megatron/ul2_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/ul2_dataset.py index c2d19305cf03..485388d84343 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/ul2_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/ul2_dataset.py @@ -25,6 +25,7 @@ class UL2Dataset(T5Dataset): """ UL2 Dataset from https://arxiv.org/abs/2205.05131. Consists of three different objectives: + 1. Short span masking with small probabilities (ex: T5). Typically max ngram size of 5 with 0.15 mask prob. 2. Extreme span masking with either large probabilities or large ngram sizes or both. 3. Prefx-LM as in the T5 or LM-adapted T5 (prompt-tuning paper). @@ -312,7 +313,8 @@ def build_extreme_masking_training_sample( skip_masking_id=None, ): """Build training sample. - Arguments: + + Args: sample: A list of sentences in which each sentence is a list token ids. target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py index d974c8182234..102ab5ec0f84 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py @@ -182,9 +182,11 @@ def build_train_valid_test_datasets(self): return self._train_ds, self._validation_ds, self._test_ds def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. 
""" diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index dc6d81649122..0f1fa76f9b01 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -760,9 +760,11 @@ def _append_sequence_parallel_module_grads(self, module, grads): grads.append(grad.data) def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7a2f3459470c..d7f489abf158 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1475,9 +1475,11 @@ def build_pretraining_data_loader( ) def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 459bf5b71c7e..4c39bd877b4a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -911,9 +911,11 @@ def build_pretraining_data_loader(self, dataset, consumed_samples, num_workers): ) def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ @@ -1413,11 +1415,13 @@ def dummy(): def complete(self, request: Dict): """ - Autoregressively invokes language model in the inference mode + Autoregressively invokes language model in the inference mode + Args: request: Dictionary with the following fields * prompt: a string which text the model should complete. 
* tokens_to_generate: how many tokens to generate while doing prompt completion. + Returns: response: A python dictionary with the following fields * prompt: original text of the prompt diff --git a/nemo/collections/nlp/modules/common/transformer/text_generation.py b/nemo/collections/nlp/modules/common/transformer/text_generation.py index a4e37935adc9..5f0275ff4553 100644 --- a/nemo/collections/nlp/modules/common/transformer/text_generation.py +++ b/nemo/collections/nlp/modules/common/transformer/text_generation.py @@ -67,47 +67,48 @@ def generate( inputs (Union[List[str], Tensor, List[dict]]): Can be one of the 3 types: - 1. List of strings. Each element of the list provides input prompt. The model will apply tokenizer on it. - E.g [‘sentence’, ‘sentence2’ … ] + 1. List of strings. Each element of the list provides input prompt. The model will apply tokenizer on it. + E.g [‘sentence’, ‘sentence2’ … ] - 2. Tuple of Pytorch Tensors (context_tokens, context_lengths). The `context_tokens` has shape (batch_size, seq_length), it's the batched sequences of tokens used as a prompst for the generation or as model inputs to the encoder. - The generative model will skip the tokenization and padding step. The `context_lengths` has shape (batch_size,), it indicates the length of the context tokens for each of the input sequences. - E.g. ( torch.tensor([[23,5234,23,35,…], [223,323,23,23232,232,...] …]), torch.tensor([20, 30, …])) + 2. Tuple of Pytorch Tensors (context_tokens, context_lengths). The `context_tokens` has shape (batch_size, seq_length), it's the batched sequences of tokens used as a prompst for the generation or as model inputs to the encoder. + The generative model will skip the tokenization and padding step. The `context_lengths` has shape (batch_size,), it indicates the length of the context tokens for each of the input sequences. + E.g. ( torch.tensor([[23,5234,23,35,…], [223,323,23,23232,232,...] …]), torch.tensor([20, 30, …])) - 3. List of python dict objects. Used for prompt/p-tuning inputs where a set of key-value pairs are converted into input token embeddings for the model. - E.g. [{"prompt-tag": "sentiment", "sentence": "this is a good movie"}, - {"prompt-tag": "qa", "context": "some context text", "question": "a simple question"} ... ] - where 'prompt-tag' is used to identify the type of NLP task to solve. + 3. List of python dict objects. Used for prompt/p-tuning inputs where a set of key-value pairs are converted into input token embeddings for the model. + E.g. [{"prompt-tag": "sentiment", "sentence": "this is a good movie"}, + {"prompt-tag": "qa", "context": "some context text", "question": "a simple question"} ... ] + where 'prompt-tag' is used to identify the type of NLP task to solve. length_params (LengthParam): a dictionary type which controls the sampling length. - max_length: int, The maximum length of the sequence to be generated. - - min_length: int, The minimum length of the sequence to be generated. + * max_length: int, The maximum length of the sequence to be generated. + * min_length: int, The minimum length of the sequence to be generated. If None, max_length is set to 30, and min_length is set to None + sampling_params (SamplingParam): a dictionary type which contains the parameters for text sampling. It has the following keys - use_greedy: bool, Whether or not to use sampling ; use greedy decoding otherwise - top_k: int, The number of highest probability vocabulary tokens to keep for top-k-filtering. 
- top_p: float, If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - repetition_penalty: float, The parameter for repetition penalty. 1.0 means no penalty. - add_BOS: bool, Whether add the bos token at the begining of the prompt - all_probs: bool # whether return the log prob for all the tokens in vocab - compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False - end_strings: List[str] # generation will stop when one of these tokens is generated + * use_greedy: bool, Whether or not to use sampling ; use greedy decoding otherwise + * top_k: int, The number of highest probability vocabulary tokens to keep for top-k-filtering. + * top_p: float, If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + * repetition_penalty: float, The parameter for repetition penalty. 1.0 means no penalty. + * add_BOS: bool, Whether add the bos token at the begining of the prompt + * all_probs: bool # whether return the log prob for all the tokens in vocab + * compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False + * end_strings: List[str] # generation will stop when one of these tokens is generated + Default None, If it is None, use_greedy will be "True". Returns: - OutputType: It generates the output in a dictionary type. It has the following keys: - - sentences: List[str], output sentences - tokens: List[List[str]], output sentences borken into tokens - logprob: List[List[float]], log prob of generated tokens - full_logprob: List[List[float]], log prob of all the tokens in the vocab - token_ids: List[List[int]], output sentence token ids - offsets: List[List[int]] # list of tokens start positions in text + It generates the output in a dictionary type. It has the following keys, + + * sentences: List[str], output sentences + * tokens: List[List[str]], output sentences borken into tokens + * logprob: List[List[float]], log prob of generated tokens + * full_logprob: List[List[float]], log prob of all the tokens in the vocab + * token_ids: List[List[int]], output sentence token ids + * offsets: List[List[int]] # list of tokens start positions in text """ raise NotImplementedError("please implement this method") diff --git a/nemo/collections/vision/models/megatron_vit_classification_models.py b/nemo/collections/vision/models/megatron_vit_classification_models.py index c27c37c2b917..ea6d3578c540 100644 --- a/nemo/collections/vision/models/megatron_vit_classification_models.py +++ b/nemo/collections/vision/models/megatron_vit_classification_models.py @@ -621,9 +621,11 @@ def build_pretraining_data_loader(self, dataset, consumed_samples, drop_last=Tru ) def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. 
""" diff --git a/nemo/core/classes/dataset.py b/nemo/core/classes/dataset.py index 738ae22f5416..789fc0b863d7 100644 --- a/nemo/core/classes/dataset.py +++ b/nemo/core/classes/dataset.py @@ -42,12 +42,15 @@ def collate_fn(self, batch): Please note, subclasses of Dataset should not implement `input_types`. - # Usage: - dataloader = torch.utils.data.DataLoader( - ...., - collate_fn=dataset.collate_fn, - .... - ) + Usage: + + .. code-block:: python + + dataloader = torch.utils.data.DataLoader( + ...., + collate_fn=dataset.collate_fn, + .... + ) Returns: Collated batch, with or without types. diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index be9a6e8cfbb3..5c7cac5a9a55 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -304,9 +304,9 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo recent checkpoint under ``*last.ckpt``, and the final checkpoint after training completes under ``*end.ckpt``. Defaults to True. - create_early_stopping_callback (bool): Flag to decide if early stopping should be used to stop training. Default is False. - See EarlyStoppingParams dataclass above. + See EarlyStoppingParams dataclass above. - create_preemption_callback (bool): Flag to decide whether to enable preemption callback to save checkpoints and exit training - immediately upon preemption. Default is True. + immediately upon preemption. Default is True. - files_to_copy (list): A list of files to copy to the experiment logging directory. Defaults to None which copies no files. - log_local_rank_0_only (bool): Whether to only create log files for local rank 0. Defaults to False. From 9e2325d18b4a0e6576ffabe8003c3cad26eb3954 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Wed, 1 May 2024 16:34:21 -0700 Subject: [PATCH 017/178] Handle case where num_query_groups is set to null for LoRA config setup (#9075) Signed-off-by: Valerie Sarge --- nemo/collections/nlp/parts/peft_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 47d5167d630e..820e2ad63f24 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -123,6 +123,9 @@ def __init__(self, cfg): kv_channels = self._calculate_kv_channels(cfg) projection_size = kv_channels * cfg.num_attention_heads num_query_groups = cfg.get("num_query_groups", cfg.num_attention_heads) + if num_query_groups is None: + # Cover the case where num_query_groups is explicitly set to null + num_query_groups = cfg.num_attention_heads qkv_projection_size = projection_size + (2 * kv_channels * num_query_groups) From d66ca999b80bb9da0af05da13b6b3b51142535dc Mon Sep 17 00:00:00 2001 From: Alexey Panteleev Date: Wed, 1 May 2024 17:33:32 -0700 Subject: [PATCH 018/178] TRT-LLM export P-tuning related fixes (#8863) * Fixed the uses of pathlib.Path. Signed-off-by: Alexey Panteleev * Add the bos token to LLAMA based models. Signed-off-by: Alexey Panteleev * P-tuning related fixes: - Remember the vtoken counts for each p-tuning table when the tables are added; - Prepend the right number of vtokens to each query based on its task_id; - Preserve the dtype of the p-tuning table when it is padded; - Validate that all p-tuning tables fit into max_prompt_embedding_table_size limit. 
Signed-off-by: Alexey Panteleev * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Alexey Panteleev Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay --- nemo/export/tensorrt_llm.py | 24 ++++++++-- nemo/export/trt_llm/tensorrt_llm_model.py | 4 +- nemo/export/trt_llm/tensorrt_llm_run.py | 55 ++++++++++++++++++----- 3 files changed, 68 insertions(+), 15 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 40fb93816a33..033044b3b328 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -97,6 +97,7 @@ def __init__(self, model_dir: str, lora_ckpt_list: List[str] = None, load_model: self.ptuning_tables = [] self.p_table = None self.task_vocab_size = 0 + self.task_vtoken_counts = [] self.task_ids = {} if load_model: @@ -358,12 +359,15 @@ def forward( prompt_embeddings_table, prompt_embeddings_checkpoint_path ) tv_size = prompt_table.size(dim=0) + task_vtoken_counts = [tv_size] elif len(self.ptuning_tables) > 0: prompt_table = self.p_table tv_size = self.task_vocab_size + task_vtoken_counts = self.task_vtoken_counts else: prompt_table = None tv_size = None + task_vtoken_counts = None if task_ids is None: assert prompt_table is None, "There is a prompt embedding table and task_ids cannot be None" @@ -404,6 +408,7 @@ def forward( temperature=temperature, prompt_table=prompt_table, task_vocab_size=tv_size, + task_vtoken_counts=task_vtoken_counts, task_ids=input_task_ids, lora_uids=lora_uids, stop_words_list=stop_words_list, @@ -423,6 +428,7 @@ def forward( temperature=temperature, prompt_table=prompt_table, task_vocab_size=tv_size, + task_vtoken_counts=task_vtoken_counts, task_ids=input_task_ids, lora_uids=lora_uids, stop_words_list=stop_words_list, @@ -578,19 +584,31 @@ def _prep_ptuning_table(self): if self.task_vocab_size < pt["table"].size(dim=0): self.task_vocab_size = pt["table"].size(dim=0) - # pad tasks to longest task embedding table + # pad tasks to longest task embedding table, remember the original task vtoken counts vtokens_embeddings = [] + self.task_vtoken_counts = [] self.task_ids = {} tid = 0 for i, ptuning_table in enumerate(self.ptuning_tables): - padded_table = torch.zeros((self.task_vocab_size, self.get_hidden_size)) - padded_table[: ptuning_table["table"].size(dim=0), :] = ptuning_table["table"] + original_table = ptuning_table["table"] + vtoken_count = original_table.size(dim=0) + padded_table = torch.zeros((self.task_vocab_size, self.get_hidden_size), dtype=original_table.dtype) + padded_table[:vtoken_count, :] = original_table vtokens_embeddings.append(padded_table) self.task_ids[ptuning_table["task_name"]] = tid + self.task_vtoken_counts.append(vtoken_count) tid = tid + 1 if len(vtokens_embeddings) > 0: self.p_table = torch.stack(vtokens_embeddings, dim=0).view(-1, self.get_hidden_size) + + max_prompt_embedding_table_size = self.config['builder_config']['max_prompt_embedding_table_size'] + actual_prompt_table_size = self.p_table.shape[0] + + if actual_prompt_table_size > max_prompt_embedding_table_size: + raise Exception( + f"The size of the combined prompt embedding table ({actual_prompt_table_size}) is greater than max_prompt_embedding_table_size ({max_prompt_embedding_table_size})." 
+ ) else: self.p_table = None diff --git a/nemo/export/trt_llm/tensorrt_llm_model.py b/nemo/export/trt_llm/tensorrt_llm_model.py index 52e9c4960fc9..736d6180807e 100644 --- a/nemo/export/trt_llm/tensorrt_llm_model.py +++ b/nemo/export/trt_llm/tensorrt_llm_model.py @@ -26,7 +26,7 @@ from tensorrt_llm.module import Module, ModuleList from nemo.export.trt_llm.decoder import build_decoder_layer -from nemo.export.trt_llm.model_config import DECODER_GEMMA, ModelConfig +from nemo.export.trt_llm.model_config import DECODER_GEMMA, DECODER_LLAMA, ModelConfig from nemo.export.trt_llm.quantization_utils import quantize_linear from nemo.export.trt_llm.tensorrt_llm_build import build from nemo.export.trt_llm.tensorrt_llm_utils import ( @@ -65,7 +65,7 @@ def __init__(self, model_config: ModelConfig): else model_config.head_size ) self._use_prompt_tuning = model_config.use_prompt_tuning - self._add_bos = model_config.layers[0].decoder_type == DECODER_GEMMA + self._add_bos = model_config.layers[0].decoder_type in (DECODER_GEMMA, DECODER_LLAMA) self._mapping = model_config.mapping self.rank = model_config.mapping.rank self.max_lora_rank = model_config.max_lora_rank diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index d7e3e40c87a2..c490f37e1fc4 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -491,6 +491,47 @@ def forward( raise RuntimeError("Internal error") +def prepare_input_tensors( + input_texts: List[str], + host_context: TensorrtLLMHostContext, + prompt_table=None, + task_vtoken_counts: List[int] = None, + task_ids: List[int] = None, +): + tokenizer = host_context.tokenizer + + if host_context.add_bos: + bos_tokens = [tokenizer.bos_token_id] + else: + bos_tokens = [] + + input_tokens = [bos_tokens + tokenizer.encode(t) for t in input_texts] + + # If p-tuning is used, we need to prepend vtokens to each input. + if prompt_table is not None: + + # Go over the tokenized prompts and prepend vtokens. + # The number of vtokens could be different for each task. + for prompt_index in range(len(input_texts)): + # Find out the number of vtokens to generate + task_id = task_ids[prompt_index] + num_vtokens = task_vtoken_counts[task_id] + + # Create a tensor with vtokens, e.g. 32000, 32001, 32002... when vocab_size=32000 + # TRT-LLM will convert each vtoken into its corresponding embedding row from the prompt table. + vocab_size = tokenizer.vocab_size + vtokens = list(range(vocab_size, vocab_size + num_vtokens)) + + # Concatenate the vtokens with the real tokens + real_tokens = input_tokens[prompt_index] + input_tokens[prompt_index] = vtokens + real_tokens + + # Convert input token lists to tensors + input_tensors = [torch.IntTensor(token_list) for token_list in input_tokens] + + return input_tensors + + def generate( input_texts: List[str], max_output_len: int, @@ -500,6 +541,7 @@ def generate( temperature: float = 1.0, prompt_table=None, task_vocab_size=None, + task_vtoken_counts: List[int] = None, task_ids: List[int] = None, lora_uids: List[str] = None, stop_words_list=None, @@ -515,11 +557,7 @@ def generate( Returns a 2D string list with shape [batch_size, num_beams]. 
""" tokenizer = host_context.tokenizer - - if host_context.add_bos: - input_tensors = [torch.IntTensor([tokenizer.bos_token_id] + tokenizer.encode(t)) for t in input_texts] - else: - input_tensors = [torch.IntTensor(tokenizer.encode(t)) for t in input_texts] + input_tensors = prepare_input_tensors(input_texts, host_context, prompt_table, task_vtoken_counts, task_ids) stop_words_list_tensors = None if stop_words_list is not None: @@ -582,6 +620,7 @@ def generate_streaming( temperature: float = 1.0, prompt_table=None, task_vocab_size=None, + task_vtoken_counts: List[int] = None, task_ids: List[int] = None, lora_uids: List[str] = None, stop_words_list=None, @@ -594,11 +633,7 @@ def generate_streaming( Returns a 2D string list with shape [batch_size, num_beams]. """ tokenizer = host_context.tokenizer - - if host_context.add_bos: - input_tensors = [torch.IntTensor([tokenizer.bos_token_id] + tokenizer.encode(t)) for t in input_texts] - else: - input_tensors = [torch.IntTensor(tokenizer.encode(t)) for t in input_texts] + input_tensors = prepare_input_tensors(input_texts, host_context, prompt_table, task_vtoken_counts, task_ids) batch_size = len(input_texts) From 0643511a29101801afad070c80d26040d48eaa3a Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Thu, 2 May 2024 05:47:11 +0200 Subject: [PATCH 019/178] [NeMo-UX] Add mixed-precision plugin (#9065) * Adding MegatronParallel * Move over _strategy_liMegatronCheckpointIO * Adding GPTModel & MockDataModule * Adding mixed-precision to NeMo * Fix import * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert unintended changes Signed-off-by: Chen Cui * clean up code and reinstate mix precision tests Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * clean up Signed-off-by: Chen Cui * use cpu for unit test Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Chen Cui --- nemo/lightning/__init__.py | 11 +- nemo/lightning/pytorch/plugins/__init__.py | 6 +- .../pytorch/plugins/mixed_precision.py | 166 ++++++++++++++++++ tests/lightning/test_megatron_parallel.py | 106 ++++++----- 4 files changed, 232 insertions(+), 57 deletions(-) create mode 100644 nemo/lightning/pytorch/plugins/mixed_precision.py diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index f900345f96eb..afbdb39f42d4 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -4,7 +4,7 @@ from pytorch_lightning import plugins as _pl_plugins from nemo.lightning.base import get_vocab_size, teardown -from nemo.lightning.pytorch.plugins import MegatronDataSampler +from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler from nemo.lightning.pytorch.strategies import MegatronStrategy from nemo.lightning.pytorch.trainer import Trainer @@ -22,4 +22,11 @@ def _is_slurm_interactive_mode(): _pl_plugins._PLUGIN_INPUT = Union[_pl_plugins._PLUGIN_INPUT, _data_sampler.DataSampler] # noqa: SLF001 -__all__ = ["MegatronStrategy", "MegatronDataSampler", "Trainer", "get_vocab_size", "teardown"] +__all__ = [ + "MegatronStrategy", + "MegatronDataSampler", + "MegatronMixedPrecision", + "Trainer", + "get_vocab_size", + "teardown", +] diff --git a/nemo/lightning/pytorch/plugins/__init__.py b/nemo/lightning/pytorch/plugins/__init__.py index 
45f88a383681..d99e1a3ca7b9 100644 --- a/nemo/lightning/pytorch/plugins/__init__.py +++ b/nemo/lightning/pytorch/plugins/__init__.py @@ -1,3 +1,7 @@ from nemo.lightning.pytorch.plugins.data_sampler import MegatronDataSampler +from nemo.lightning.pytorch.plugins.mixed_precision import MegatronMixedPrecision -__all__ = ["MegatronDataSampler"] +__all__ = [ + "MegatronDataSampler", + "MegatronMixedPrecision", +] diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py new file mode 100644 index 000000000000..af7054526957 --- /dev/null +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -0,0 +1,166 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from contextlib import contextmanager +from typing import Any, Callable, Generator, List, Literal, Tuple, TypeVar, Union + +import pytorch_lightning as pl +import torch +from pytorch_lightning.plugins.precision import MixedPrecision +from torch.nn import Module +from torch.optim import Optimizer + +from nemo.lightning._strategy_lib import GradScaler + +AnyT = TypeVar("AnyT") + + +class MegatronMixedPrecision(MixedPrecision): + def __init__(self, precision: Literal["16-mixed", "bf16-mixed"], amp_O2: bool = True, device="cuda",) -> None: + if precision == "bf16-mixed": + scaler = None + else: + scaler = GradScaler(init_scale=2 ** 32, growth_interval=1000, hysteresis=2) + + super().__init__(precision, device, scaler) + + # MixedPrecisionPlugin class in PTL >= 2.0 takes only "16-mixed" or "bf16-mixed" for precision arg + if precision == "16-mixed": + dtype = torch.float16 + + def float16_convertor(val): + return val.half() + + elif precision == "bf16-mixed": + dtype = torch.bfloat16 + + def float16_convertor(val): + return val.bfloat16() + + else: + raise ValueError("precision must be '16-mixed' or 'bf16-mixed'") + + self.dtype = dtype + torch.set_autocast_gpu_dtype(dtype) + self.float16_convertor = float16_convertor + self.amp_O2 = amp_O2 + + def connect( + self, model: Module, optimizers: List[Optimizer], lr_schedulers: List[Any] + ) -> Tuple[Module, List[Optimizer], List[Any]]: + """Connects this plugin to the accelerator and the training process.""" + from nemo.core.optim import MainParamsOptimizerWrapper + + if not optimizers or not self.amp_O2 or isinstance(optimizers[0], MainParamsOptimizerWrapper): + return model, optimizers, lr_schedulers + + _optimizers = [*optimizers] + _optimizers[0] = self.convert_optimizer(_optimizers[0]) + + return model, _optimizers, lr_schedulers + + def convert_module(self, module: Module) -> Module: + """Convert the module parameters to the precision type this plugin handles. + + This is optional and depends on the precision limitations during optimization. 
+ + """ + if self.precision == "bf16-mixed": + return module.bfloat16() + if self.precision == "16-mixed": + return module.half() + + return module + + def convert_optimizer(self, optimizer: Optimizer) -> Optimizer: + """Convert the optimizer parameters to the precision type this plugin handles. + + This is optional and depends on the precision limitations during optimization. + + """ + from nemo.core.optim import MainParamsOptimizerWrapper + + if isinstance(optimizer, MainParamsOptimizerWrapper) or not self.amp_O2: + return optimizer + + return MainParamsOptimizerWrapper(optimizer, fp32_grad_accum=True, contiguous_grad_bucket=True,) + + def convert_input(self, data: AnyT) -> AnyT: + """Convert model inputs (forward) to the floating point precision type of this plugin. + + Note: MegatronStrategy will take care of only doing this when: + parallel_state.is_pipeline_first_stage() + + """ + from megatron.core.transformer.module import fp32_to_float16 + + return fp32_to_float16(data, self.float16_convertor) + + def convert_output(self, data: AnyT) -> AnyT: + """Convert outputs to the floating point precision type expected after model's forward. + + Note: MegatronStrategy will take care of only doing this when: + parallel_state.is_pipeline_last_stage() + + """ + from megatron.core.transformer.module import float16_to_fp32 + + return float16_to_fp32(data) + + def optimizer_step( + self, + optimizer: torch.optim.Optimizer, + model: Union["pl.LightningModule", torch.nn.Module], + closure: Callable[[], Any], + **kwargs: Any, + ) -> None: + from nemo.core.optim import MainParamsOptimizerWrapper + + if not self.amp_O2 and not isinstance(optimizer, MainParamsOptimizerWrapper): + return super().optimizer_step(optimizer, model, closure, **kwargs) + + if self.scaler is None: + assert optimizer.fp32_grad_accumulation, "BF16 uses FP32 grad accumulation" + _ = closure() + self._after_closure(model, optimizer) + return optimizer.step(**kwargs) + + assert not optimizer.fp32_grad_accumulation, "FP16 uses FP16 grad accumulation" + closure_result = closure() + + # TODO: Add an option for merged all-reduce + + # cast fp16 grads to fp32 and copy to main grads, which are used for unscale and param update + optimizer.copy_model_grads_to_main_grads() + # `unscale` after the closure is executed but before the `on_before_optimizer_step` hook. + # unscale main (fp32) gradients + self.scaler.unscale_(optimizer) + self._after_closure(model, optimizer) + skipped_backward = closure_result is None + # in manual optimization, the closure does not return a value + if not isinstance(model, pl.LightningModule) or not model.automatic_optimization or not skipped_backward: + # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found + self.scaler.step(optimizer, **kwargs) + self.scaler.update() + + @contextmanager + def forward_context(self) -> Generator[None, None, None]: + """No explicit precision casting. 
Inputs are supposed to be manually casted.""" + try: + yield + finally: + pass + + +__all__ = ["MegatronMixedPrecision"] diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index 06e614d48251..877e6a39a976 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -1,6 +1,7 @@ from collections import defaultdict import pytest +from megatron.core import parallel_state from torch import nn from nemo import lightning as nl @@ -24,11 +25,10 @@ def forward(self, x): return DummyModule() - # TODO (chcui): Uncomment this test when we merge mixed-precision - # @pytest.fixture - # def mock_precision_plugin(self, mocker): - # """Fixture to create a mock precision plugin.""" - # return nl.MegatronMixedPrecision(precision="bf16-mixed") + @pytest.fixture + def mock_precision_plugin(self, mocker): + """Fixture to create a mock precision plugin.""" + return nl.MegatronMixedPrecision(precision="bf16-mixed") @pytest.fixture def mock_callbacks(self, mocker): @@ -64,55 +64,53 @@ def test_init_with_defaults(self, mocker, mock_pipeline): assert megatron_parallel.forward_step == mp.default_forward_step assert megatron_parallel.loss_reduction is None - # TODO (chcui): Uncomment this test when we merge mixed-precision - # def test_init_with_custom_parameters( - # self, - # mocker, - # mock_pipeline, - # mock_precision_plugin, - # mock_callbacks, - # mock_data_step, - # mock_forward_step, - # mock_loss_reduction - # ): - # """Test __init__ with custom parameters.""" - # mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=1) - # mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=False) - # - # megatron_parallel = mp.MegatronParallel( - # pipeline=mock_pipeline, - # precision_plugin=mock_precision_plugin, - # callbacks=mock_callbacks, - # data_step=mock_data_step, - # forward_step=mock_forward_step, - # loss_reduction=mock_loss_reduction - # ) - # - # assert megatron_parallel.pipeline == mock_pipeline - # assert megatron_parallel.precision_plugin == mock_precision_plugin - # assert megatron_parallel.callbacks == mock_callbacks - # assert megatron_parallel.data_step == mock_data_step - # assert megatron_parallel.forward_step == mock_forward_step - # assert megatron_parallel.loss_reduction == mock_loss_reduction - - # TODO: Comment-out this test when we merge nemo.io - # def test_init_with_virtual_pipeline(self, mocker, mock_pipeline): - # """Test __init__ with virtual pipeline model parallel world size.""" - # mocker.patch('torch.distributed.get_rank', return_value=1) - # mocker.patch('megatron.core.parallel_state.get_tensor_model_parallel_group', return_value=1) - # mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_group', return_value=1) - # mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=2) - # mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=True) - # mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_world_size') - # mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_rank') - # mocker.patch('nemo_ext.lightning._strategy_lib.init_lightning_module', return_value=mock_pipeline) - - # megatron_parallel = mp.MegatronParallel(mock_pipeline, vp_size=2) - - # assert len(megatron_parallel.pipeline) == 2 - # assert all(isinstance(mod, nn.Module) for mod in megatron_parallel.pipeline) - # 
megatron.core.parallel_state.set_virtual_pipeline_model_parallel_world_size.assert_called_once_with(2) - # assert megatron.core.parallel_state.set_virtual_pipeline_model_parallel_rank.call_count == 1 + def test_init_with_custom_parameters( + self, + mocker, + mock_pipeline, + mock_precision_plugin, + mock_callbacks, + mock_data_step, + mock_forward_step, + mock_loss_reduction, + ): + """Test __init__ with custom parameters.""" + mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=1) + mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=False) + + megatron_parallel = mp.MegatronParallel( + pipeline=mock_pipeline, + precision_plugin=mock_precision_plugin, + callbacks=mock_callbacks, + data_step=mock_data_step, + forward_step=mock_forward_step, + loss_reduction=mock_loss_reduction, + ) + + assert megatron_parallel.pipeline == mock_pipeline + assert megatron_parallel.precision_plugin == mock_precision_plugin + assert megatron_parallel.callbacks == mock_callbacks + assert megatron_parallel.data_step == mock_data_step + assert megatron_parallel.forward_step == mock_forward_step + assert megatron_parallel.loss_reduction == mock_loss_reduction + + def test_init_with_virtual_pipeline(self, mocker, mock_pipeline): + """Test __init__ with virtual pipeline model parallel world size.""" + mocker.patch('torch.distributed.get_rank', return_value=1) + mocker.patch('megatron.core.parallel_state.get_tensor_model_parallel_group', return_value=1) + mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_group', return_value=1) + mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=2) + mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=True) + mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_world_size') + mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_rank') + mocker.patch('nemo.io.reinit', return_value=mock_pipeline) + + megatron_parallel = mp.MegatronParallel(mock_pipeline, vp_size=2, cpu=True) + + assert len(megatron_parallel.pipeline) == 2 + assert all(isinstance(mod, nn.Module) for mod in megatron_parallel.pipeline) + parallel_state.set_virtual_pipeline_model_parallel_world_size.assert_called_once_with(2) + assert parallel_state.set_virtual_pipeline_model_parallel_rank.call_count == 1 class TestCallbackConnector: From a8e0ca1b6206b4158c96781176f5b0d80b49f9cc Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Thu, 2 May 2024 00:06:20 -0600 Subject: [PATCH 020/178] Comment baichuan test and update pr template (#9085) * comment test Signed-off-by: eharper * comment test Signed-off-by: eharper --------- Signed-off-by: eharper --- .github/PULL_REQUEST_TEMPLATE.md | 8 ++--- .github/workflows/cicd-main.yml | 51 +++++++++++++++++--------------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 2c4946bbbde1..ae22ede4807b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -14,13 +14,13 @@ Add a one line overview of what this PR aims to accomplish. # Add a code snippet demonstrating how to use this ``` -# Jenkins CI +# GitHub Actions CI The Jenkins CI system has been replaced by GitHub Actions self-hosted runners. -There's no need to comment `jenkins` on the PR to trigger Jenkins CI. -The GitHub Actions CI will run automatically when the PR is opened. 
-To run CI on an untrusted fork, a NeMo user with write access must click "Approve and run". +The GitHub Actions CI will run automatically when the "Run CICD" label is added to the PR. +To re-run CI remove and add the label again. +To run CI on an untrusted fork, a NeMo user with write access must first click "Approve and run". # Before your PR is "Ready for review" **Pre checks**: diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 6f090bd34213..df631443e7f7 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -319,29 +319,32 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - L2_Community_LLM_Checkpoints_tests_Baichuan2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ - --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + # this test is using a 7B model which is too large for GitHub CI + # replace the model in this test with a toy model or move the test + # to the nightly CI + # L2_Community_LLM_Checkpoints_tests_Baichuan2: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # --volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - run: | + # python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ + # --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ + # --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo + # rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo + # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" L2_PTQ_Llama2_Export_Only: needs: [cicd-test-container-setup] @@ -6370,7 +6373,7 @@ jobs: - L2_Community_LLM_Checkpoints_tests_Llama - L2_Community_LLM_Checkpoints_tests_StarCoder - L2_Community_LLM_Checkpoints_tests_Falcon - - L2_Community_LLM_Checkpoints_tests_Baichuan2 + #- L2_Community_LLM_Checkpoints_tests_Baichuan2 - ASR_dev_run_Speech_to_Text - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet - ASR_dev_run_Speech_Pre-training_-_CitriNet From f15e8975fd12e23ca1fd887222e7a636c52a8167 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Thu, 2 May 2024 09:48:10 -0700 Subject: [PATCH 021/178] Add safe extraction of nemo tar files (#8976) * Add safe extraction of nemo tar files Signed-off-by: Abhishree * Fix bugs Signed-off-by: Abhishree * Replace print with logging Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --------- Signed-off-by: Abhishree Signed-off-by: Abhishree Thittenamane 
<47577437+athitten@users.noreply.github.com> Co-authored-by: Eric Harper Co-authored-by: Pablo Garay --- .../core/connectors/save_restore_connector.py | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/nemo/core/connectors/save_restore_connector.py b/nemo/core/connectors/save_restore_connector.py index 2d01e9d5bad8..70d91066b7f0 100644 --- a/nemo/core/connectors/save_restore_connector.py +++ b/nemo/core/connectors/save_restore_connector.py @@ -553,6 +553,29 @@ def _make_nemo_file_from_folder(filename, source_dir): with tarfile.open(filename, "w:") as tar: tar.add(source_dir, arcname=".") + @staticmethod + def _is_safe_path(member, extract_to): + # Check for path traversal characters or absolute paths + member_path = os.path.normpath(member.name) + # Ensure the path does not start with a slash or contain ".." after normalization + if os.path.isabs(member_path) or ".." in member_path.split(os.sep): + return False + # Construct the full path where the member would be extracted + full_path = os.path.join(extract_to, member_path) + # Ensure the member would be extracted within the intended directory + return os.path.commonprefix([full_path, extract_to]) == extract_to + + @staticmethod + def _safe_extract(tar, out_folder: str, members=None): + extract_to = os.path.realpath(out_folder) + if members is None: + members = tar.getmembers() + for member in members: + if SaveRestoreConnector._is_safe_path(member, extract_to): + tar.extract(member, extract_to) + else: + logging.warning(f"Skipping potentially unsafe member: {member.name}") + @staticmethod def _unpack_nemo_file(path2file: str, out_folder: str, extract_config_only: bool = False) -> str: if not os.path.exists(path2file): @@ -569,10 +592,10 @@ def _unpack_nemo_file(path2file: str, out_folder: str, extract_config_only: bool tar_header = "r:gz" tar = tarfile.open(path2file, tar_header) if not extract_config_only: - tar.extractall(path=out_folder) + SaveRestoreConnector._safe_extract(tar, out_folder) else: members = [x for x in tar.getmembers() if ".yaml" in x.name] - tar.extractall(path=out_folder, members=members) + SaveRestoreConnector._safe_extract(tar, out_folder, members) tar.close() return out_folder From 9100cfd6462e1dbd5119b5affa845b1c061f265b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Thu, 2 May 2024 14:13:46 -0400 Subject: [PATCH 022/178] PyTorch CUDA allocator optimization for dynamic batch shape dataloading in ASR (#9061) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Option to auto-set expandable_segments in PyTorch CUDA allocator Signed-off-by: Piotr Żelasko * warning Signed-off-by: Piotr Żelasko * set opts after parsing config Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko --- .../common/data/lhotse/dataloader.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index eabc3da5d11b..191ac54589e5 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os import warnings from dataclasses import dataclass from functools import partial @@ -74,6 +74,7 @@ class LhotseDataLoadingConfig: drop_last: bool = False shard_seed: int | str = "trng" max_open_streams: int | None = None + cuda_expandable_segments: bool = True # 2.1 Multimodal sampling override options use_multimodal_sampling: bool = False @@ -150,6 +151,8 @@ def get_lhotse_dataloader_from_config( config = make_structured_with_schema_warnings(config) + maybe_set_cuda_expandable_segments(enabled=config.cuda_expandable_segments) + # First, resolve the random seed in case a string value was provided. seed = resolve_seed(config.seed) fix_random_seed(seed) @@ -451,6 +454,28 @@ def _flatten_alt_text(cut) -> list: return ans +def maybe_set_cuda_expandable_segments(enabled: bool): + """ + Configures PyTorch memory allocator to expand existing allocated segments + instead of re-allocating them when tensor shape grows. + This can help speed up the training when sequence length and/or batch size change often, + and makes GPU more robust towards OOM. + + See here for more details: + https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf + """ + if enabled and torch.cuda.is_available(): + if ( + (value := os.environ.get("PYTORCH_CUDA_ALLOC_CONF")) is not None + and len(value) > 0 + and "expandable_segments:True" not in value + ): + warnings.warn( + "You have set PYTORCH_CUDA_ALLOC_CONF without expandable_segments:True option. We're setting that option anyway. To disable it, set cuda_expandable_segments=False in NeMo dataloader configuration." + ) + torch.cuda.memory._set_allocator_settings("expandable_segments:True") + + def _select_channel(cut, channel_selector: int | str) -> list: if isinstance(channel_selector, int): channel_idx = channel_selector From f769ad504890f4798fe2a679d337d0ddf2c05fe3 Mon Sep 17 00:00:00 2001 From: Ryan Langman Date: Thu, 2 May 2024 12:04:32 -0700 Subject: [PATCH 023/178] [TTS] Add tutorial for training audio codecs (#8723) * [TTS] Add tutorial for training audio codecs Signed-off-by: Ryan * [TTS] Update tutorial Signed-off-by: Ryan * [TTS] Add diagrams Signed-off-by: Ryan * [TTS] Add introduction and references Signed-off-by: Ryan * [TTS] Replace diagram with github release link Signed-off-by: Ryan --------- Signed-off-by: Ryan --- examples/tts/audio_codec.py | 1 + .../conf/audio_codec/audio_codec_16000.yaml | 8 +- .../conf/audio_codec/audio_codec_24000.yaml | 12 +- .../tts/conf/audio_codec/encodec_24000.yaml | 8 +- .../tts/conf/audio_codec/mel_codec_22050.yaml | 194 +++++ .../tts/conf/audio_codec/mel_codec_44100.yaml | 10 +- tutorials/tts/Audio_Codec_Training.ipynb | 800 ++++++++++++++++++ 7 files changed, 1011 insertions(+), 22 deletions(-) create mode 100644 examples/tts/conf/audio_codec/mel_codec_22050.yaml create mode 100644 tutorials/tts/Audio_Codec_Training.ipynb diff --git a/examples/tts/audio_codec.py b/examples/tts/audio_codec.py index 800edfb7fb0f..5fc4b6fd0afd 100644 --- a/examples/tts/audio_codec.py +++ b/examples/tts/audio_codec.py @@ -27,6 +27,7 @@ def main(cfg): trainer = pl.Trainer(**cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) model = AudioCodecModel(cfg=cfg.model, trainer=trainer) + model.maybe_init_from_pretrained_checkpoint(cfg=cfg) trainer.fit(model) diff --git a/examples/tts/conf/audio_codec/audio_codec_16000.yaml b/examples/tts/conf/audio_codec/audio_codec_16000.yaml index 7182414a31db..93b44b579655 100644 --- a/examples/tts/conf/audio_codec/audio_codec_16000.yaml +++ 
b/examples/tts/conf/audio_codec/audio_codec_16000.yaml @@ -92,13 +92,13 @@ model: log_epochs: [1, 2, 3, 4, 5, 6] epoch_frequency: 1 log_tensorboard: false - log_wandb: true + log_wandb: false generators: - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator log_audio: true - log_encoding: true - log_dequantized: true + log_encoding: false + log_dequantized: false dataset: _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset @@ -129,8 +129,6 @@ model: _target_: nemo.collections.tts.modules.encodec_modules.MultiResolutionDiscriminatorSTFT resolutions: [[128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]] - # The original EnCodec uses hinged loss, but squared-GAN loss is more stable - # and reduces the need to tune the loss weights or use a gradient balancer. generator_loss: _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss diff --git a/examples/tts/conf/audio_codec/audio_codec_24000.yaml b/examples/tts/conf/audio_codec/audio_codec_24000.yaml index e5e386722fb1..cf48db807d25 100644 --- a/examples/tts/conf/audio_codec/audio_codec_24000.yaml +++ b/examples/tts/conf/audio_codec/audio_codec_24000.yaml @@ -2,7 +2,7 @@ # If you want to train model on other dataset, you can change config values according to your dataset. # Most dataset-specific arguments are in the head of the config file, see below. -name: EnCodec +name: AudioCodec max_epochs: ??? # Adjust batch size based on GPU memory @@ -90,13 +90,13 @@ model: log_epochs: [10, 50, 100, 150, 200] epoch_frequency: 100 log_tensorboard: false - log_wandb: true + log_wandb: false generators: - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator log_audio: true - log_encoding: true - log_dequantized: true + log_encoding: false + log_dequantized: false dataset: _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset @@ -127,8 +127,6 @@ model: _target_: nemo.collections.tts.modules.encodec_modules.MultiResolutionDiscriminatorSTFT resolutions: [[128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]] - # The original EnCodec uses hinged loss, but squared-GAN loss is more stable - # and reduces the need to tune the loss weights or use a gradient balancer. 
generator_loss: _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss @@ -162,7 +160,7 @@ exp_manager: exp_dir: null name: ${name} create_tensorboard_logger: false - create_wandb_logger: true + create_wandb_logger: false wandb_logger_kwargs: name: null project: null diff --git a/examples/tts/conf/audio_codec/encodec_24000.yaml b/examples/tts/conf/audio_codec/encodec_24000.yaml index 4898d449d520..be66fd4b4979 100644 --- a/examples/tts/conf/audio_codec/encodec_24000.yaml +++ b/examples/tts/conf/audio_codec/encodec_24000.yaml @@ -90,13 +90,13 @@ model: log_epochs: [10, 50, 100, 150, 200] epoch_frequency: 100 log_tensorboard: false - log_wandb: true + log_wandb: false generators: - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator log_audio: true - log_encoding: true - log_dequantized: true + log_encoding: false + log_dequantized: false dataset: _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset @@ -162,7 +162,7 @@ exp_manager: exp_dir: null name: ${name} create_tensorboard_logger: false - create_wandb_logger: true + create_wandb_logger: false wandb_logger_kwargs: name: null project: null diff --git a/examples/tts/conf/audio_codec/mel_codec_22050.yaml b/examples/tts/conf/audio_codec/mel_codec_22050.yaml new file mode 100644 index 000000000000..df77e7747a51 --- /dev/null +++ b/examples/tts/conf/audio_codec/mel_codec_22050.yaml @@ -0,0 +1,194 @@ +# This config contains the default values for training 22.05kHz audio codec model which encodes mel spectrogram +# instead of raw audio. +# If you want to train model on other dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. + +name: MelCodec + +max_epochs: ??? +# Adjust batch size based on GPU memory +batch_size: 16 +# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. +# If null, then weighted sampling is disabled. +weighted_sampling_steps_per_epoch: null + +# Dataset metadata for each manifest +# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 +train_ds_meta: ??? +val_ds_meta: ??? + +log_ds_meta: ??? +log_dir: ??? + +# Modify these values based on your sample rate +sample_rate: 22050 +win_length: 1024 +hop_length: 256 +train_n_samples: 8192 # ~0.37 seconds +# The product of the up_sample_rates should match the hop_length. +# For example 8 * 8 * 2 * 2 = 256. 
+up_sample_rates: [8, 8, 2, 2] + + +model: + + max_epochs: ${max_epochs} + steps_per_epoch: ${weighted_sampling_steps_per_epoch} + + sample_rate: ${sample_rate} + samples_per_frame: ${hop_length} + + mel_loss_l1_scale: 1.0 + mel_loss_l2_scale: 0.0 + stft_loss_scale: 20.0 + time_domain_loss_scale: 0.0 + commit_loss_scale: 0.0 + + # Probability of updating the discriminator during each training step + # For example, update the discriminator 1/2 times (1 update for every 2 batches) + disc_updates_per_period: 1 + disc_update_period: 2 + + # All resolutions for mel reconstruction loss, ordered [num_fft, hop_length, window_length] + loss_resolutions: [ + [32, 8, 32], [64, 16, 64], [128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048] + ] + mel_loss_dims: [5, 10, 20, 40, 80, 160, 320] + mel_loss_log_guard: 1.0 + stft_loss_log_guard: 1.0 + feature_loss_type: absolute + + train_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + dataset_meta: ${train_ds_meta} + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} + sample_rate: ${sample_rate} + n_samples: ${train_n_samples} + min_duration: 0.4 + max_duration: null + + dataloader_params: + batch_size: ${batch_size} + drop_last: true + num_workers: 4 + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${sample_rate} + n_samples: null + min_duration: null + max_duration: null + trunc_duration: 10.0 # Only use the first 10 seconds of audio for computing validation loss + dataset_meta: ${val_ds_meta} + + dataloader_params: + batch_size: 4 + num_workers: 2 + + # Configures how audio samples are generated and saved during training. + # Remove this section to disable logging. + log_config: + log_dir: ${log_dir} + log_epochs: [10, 50, 100, 150, 200] + epoch_frequency: 100 + log_tensorboard: false + log_wandb: false + + generators: + - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator + log_audio: true + log_encoding: false + log_dequantized: false + + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${sample_rate} + n_samples: null + min_duration: null + max_duration: null + trunc_duration: 10.0 # Only log the first 10 seconds of generated audio. + dataset_meta: ${log_ds_meta} + + dataloader_params: + batch_size: 4 + num_workers: 2 + + audio_encoder: + _target_: nemo.collections.tts.modules.audio_codec_modules.MultiBandMelEncoder + mel_bands: [[0, 10], [10, 20], [20, 30], [30, 40], [40, 50], [50, 60], [60, 70], [70, 80]] + out_channels: 4 # The dimension of each codebook + hidden_channels: 128 + filters: 256 + mel_processor: + _target_: nemo.collections.tts.modules.audio_codec_modules.MelSpectrogramProcessor + mel_dim: 80 + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} + + audio_decoder: + _target_: nemo.collections.tts.modules.audio_codec_modules.HiFiGANDecoder + up_sample_rates: ${up_sample_rates} + input_dim: 32 # Should be equal to len(audio_encoder.mel_bands) * audio_encoder.out_channels + base_channels: 1024 # This is double the base channels of HiFi-GAN V1, making it approximately 4x larger. 
+ + vector_quantizer: + _target_: nemo.collections.tts.modules.audio_codec_modules.GroupFiniteScalarQuantizer + num_groups: 8 # Should equal len(audio_encoder.mel_bands) + num_levels_per_group: [8, 5, 5, 5] # 8 * 5 * 5 * 5 = 1000 entries per codebook + + discriminator: + _target_: nemo.collections.tts.modules.audio_codec_modules.Discriminator + discriminators: + - _target_: nemo.collections.tts.modules.encodec_modules.MultiResolutionDiscriminatorSTFT + resolutions: [[128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]] + - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiPeriodDiscriminator + + generator_loss: + _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss + + discriminator_loss: + _target_: nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss + + optim: + _target_: torch.optim.Adam + lr: 2e-4 + betas: [0.8, 0.99] + + sched: + name: ExponentialLR + gamma: 0.998 + +trainer: + num_nodes: 1 + devices: 1 + accelerator: gpu + strategy: ddp_find_unused_parameters_true + precision: 16 + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 5 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: false + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + mode: min + save_top_k: 5 + save_best_model: true + always_save_nemo: true + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/examples/tts/conf/audio_codec/mel_codec_44100.yaml b/examples/tts/conf/audio_codec/mel_codec_44100.yaml index 15d12f009ae0..3ae528df6a64 100644 --- a/examples/tts/conf/audio_codec/mel_codec_44100.yaml +++ b/examples/tts/conf/audio_codec/mel_codec_44100.yaml @@ -94,13 +94,13 @@ model: log_epochs: [10, 50, 100, 150, 200] epoch_frequency: 100 log_tensorboard: false - log_wandb: true + log_wandb: false generators: - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator log_audio: true - log_encoding: true - log_dequantized: true + log_encoding: false + log_dequantized: false dataset: _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset @@ -146,8 +146,6 @@ model: resolutions: [[128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]] - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiPeriodDiscriminator - # The original EnCodec uses hinged loss, but squared-GAN loss is more stable - # and reduces the need to tune the loss weights or use a gradient balancer. 
generator_loss: _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss @@ -181,7 +179,7 @@ exp_manager: exp_dir: null name: ${name} create_tensorboard_logger: false - create_wandb_logger: true + create_wandb_logger: false wandb_logger_kwargs: name: null project: null diff --git a/tutorials/tts/Audio_Codec_Training.ipynb b/tutorials/tts/Audio_Codec_Training.ipynb new file mode 100644 index 000000000000..5f42fd73aa2c --- /dev/null +++ b/tutorials/tts/Audio_Codec_Training.ipynb @@ -0,0 +1,800 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "7X-TwhdTGmlc" + }, + "source": [ + "# License" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fCQUeZRPGnoe" + }, + "source": [ + "> Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n", + ">\n", + "> Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n", + ">\n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + ">\n", + "> Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rtBDkKqVGZJ8" + }, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pZ2QSsXuGbMe" + }, + "source": [ + "In this tutorial we show how to use NeMo to train and fine-tune **neural audio codecs**.\n", + "\n", + "Neural audio codecs are deep learning models that compress audio into a low bitrate representation. The compact embedding space created by these models can be useful for various speech tasks, such as TTS and ASR.\n", + "\n", + "
\n", + "\n", + "
\n",
+ "\n",
+ "Audio codec models typically have an *encoder-quantizer-decoder* structure. The **encoder** takes an input audio signal and encodes it into a sequence of embeddings. The **quantizer** discretizes the embeddings to create a lookup table known as a **codebook**. The embeddings saved in the codebook are referred to as **audio codes**. The **decoder** takes the audio codes as input and attempts to reconstruct the original audio signal.\n",
+ "\n",
+ "To store compressed audio we only need to save the codebook index for each embedding in an audio sequence. This is how audio codec models achieve low bitrates. The codebook indices for an audio are referred to as **audio tokens**. It is becoming common for speech generation models to synthesize speech by predicting audio tokens.\n",
+ "\n",
+ "In NeMo we have implementations of the [SEANet encoder and decoder](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/modules/encodec_modules.py#L146) used by [EnCodec](https://github.com/facebookresearch/encodec), as well as a [ResNet encoder](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/modules/audio_codec_modules.py#L1035) and [HiFi-GAN decoder](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/modules/audio_codec_modules.py#L875). For quantizers we support [Residual Vector Quantizer](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/modules/encodec_modules.py#L694) (**RVQ**) and [Finite Scalar Quantizer](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/modules/audio_codec_modules.py#L409) (**FSQ**).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3OZassNG5xff"
+ },
+ "source": [
+ "# Install"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "WZvQvPkIhRi3"
+ },
+ "outputs": [],
+ "source": [
+ "BRANCH = 'main'\n",
+ "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below line\n",
+ "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n",
+ "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pathlib import Path"
+ ],
+ "metadata": {
+ "id": "v8NGOM0EzK8W"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "tvsgWO_WhV3M"
+ },
+ "outputs": [],
+ "source": [
+ "# Directory where tutorial scripts will run and outputs will be saved.\n",
+ "ROOT_DIR = Path().absolute() / \"codec_tutorial\"\n",
+ "\n",
+ "# Nemo code paths\n",
+ "NEMO_DIR = ROOT_DIR / \"nemo\"\n",
+ "NEMO_SCRIPT_DIR = NEMO_DIR / \"scripts\" / \"dataset_processing\" / \"tts\"\n",
+ "NEMO_EXAMPLES_DIR = NEMO_DIR / \"examples\" / \"tts\"\n",
+ "NEMO_CONFIG_DIR = NEMO_EXAMPLES_DIR / \"conf\"\n",
+ "\n",
+ "nemo_download_dir = str(NEMO_DIR)\n",
+ "# Download local version of NeMo scripts.
If you are running locally and want to use your own local NeMo code,\n", + "# comment out the below line and set NEMO_ROOT_DIR to your local path.\n", + "!git clone -b $BRANCH https://github.com/NVIDIA/NeMo.git $nemo_download_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KAbH7N427FdT" + }, + "source": [ + "# Configuration" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Predefined model configurations are available in https://github.com/NVIDIA/NeMo/tree/main/examples/tts/conf/audio_codec.\n", + "\n", + "Configurations available include:\n", + "\n", + "* **audio_codec_*.yaml**: Audio codec configurations optimized for various sampling rates.\n", + "* **mel_codec_*.yaml**: A mel-spectrogram based codec designed to maximize the performance of speech synthesis models.\n", + "* **encodec_*.yaml**: A reproduction of the original [EnCodec](https://arxiv.org/abs/2210.13438) model setup.\n", + "\n", + "This tutorial can be run with any of our predefined configs. As a default we have selected `audio_codec_16000.yaml`, which works for 16kHz audio." + ], + "metadata": { + "id": "ODgdGgsAAUku" + } + }, + { + "cell_type": "code", + "source": [ + "from omegaconf import OmegaConf" + ], + "metadata": { + "id": "SPtjS2LkzE9Q" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "CONFIG_FILENAME = \"audio_codec_16000.yaml\"\n", + "CONFIG_DIR = NEMO_CONFIG_DIR / \"audio_codec\"\n", + "\n", + "config_filepath = CONFIG_DIR / CONFIG_FILENAME\n", + "\n", + "if not config_filepath.exists():\n", + " raise ValueError(f\"Config file does not exist {config_filepath}\")" + ], + "metadata": { + "id": "iCPJFKg63Dsv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Read model name and sample rate from model configuration\n", + "omega_conf = OmegaConf.load(config_filepath)\n", + "MODEL_NAME = omega_conf.name\n", + "SAMPLE_RATE = omega_conf.sample_rate\n", + "print(f\"Training {MODEL_NAME} with sample rate {SAMPLE_RATE}\")" + ], + "metadata": { + "id": "QE0HYh7FjAR3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We provide pretrained model checkpoints for fine-tuning. The list of available models can be found [here](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/models/audio_codec.py#L645)." + ], + "metadata": { + "id": "W7F--_0maLh5" + } + }, + { + "cell_type": "code", + "source": [ + "import wget\n", + "from nemo.collections.tts.models.audio_codec import AudioCodecModel\n", + "\n", + "# Optionally specify a pretrained model to fine-tune from. To train from scratch, set this to 'None'.\n", + "pretrained_model_name = \"audio_codec_16khz_small\"\n", + "\n", + "if pretrained_model_name is None:\n", + " MODEL_CHECKPOINT_PATH = None\n", + "else:\n", + " model_list = AudioCodecModel.list_available_models()\n", + "\n", + " pretrained_model_url = None\n", + " for model in model_list:\n", + " if model.pretrained_model_name == pretrained_model_name:\n", + " pretrained_model_url = model.location\n", + " break\n", + "\n", + " if pretrained_model_url is None:\n", + " raise ValueError(f\"Could not find pretrained model {pretrained_model_name}. 
Models available {model_list}\")\n", + "\n", + " # Optionally load pretrained checkpoint\n", + " MODEL_CHECKPOINT_PATH = ROOT_DIR / \"models\" / f\"{pretrained_model_name}.nemo\"\n", + "\n", + " if not MODEL_CHECKPOINT_PATH.exists():\n", + " print(f\"Downloading {pretrained_model_url} to {MODEL_CHECKPOINT_PATH}\")\n", + " MODEL_CHECKPOINT_PATH.parent.mkdir(exist_ok=True)\n", + " wget.download(pretrained_model_url, out=str(MODEL_CHECKPOINT_PATH))" + ], + "metadata": { + "id": "XqAYWR65aKTx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fM4QPsLTnzK7" + }, + "source": [ + "# Dataset Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tkZC6Dl7KRl6" + }, + "source": [ + "For our tutorial, we use a subset of [VCTK](https://datashare.ed.ac.uk/handle/10283/2950) dataset with 5 speakers (p225-p229)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sYzvAYr2vo1K" + }, + "outputs": [], + "source": [ + "import tarfile\n", + "\n", + "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aoxN1QsUzX-k" + }, + "outputs": [], + "source": [ + "# Create dataset directory\n", + "DATA_DIR = ROOT_DIR / \"data\"\n", + "\n", + "DATA_DIR.mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mArlQd5Hk36b" + }, + "outputs": [], + "source": [ + "# Download the dataset\n", + "dataset_url = \"https://vctk-subset.s3.amazonaws.com/vctk_subset_multispeaker.tar.gz\"\n", + "dataset_tar_filepath = DATA_DIR / \"vctk.tar.gz\"\n", + "\n", + "if not dataset_tar_filepath.exists():\n", + " wget.download(dataset_url, out=str(dataset_tar_filepath))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p987cjtOy9C7" + }, + "outputs": [], + "source": [ + "# Extract the dataset\n", + "with tarfile.open(dataset_tar_filepath) as tar_f:\n", + " tar_f.extractall(DATA_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ko6dxYJW0i3G" + }, + "outputs": [], + "source": [ + "DATASET_DIR = DATA_DIR / \"vctk_subset_multispeaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "We5FHYQt5BeO" + }, + "outputs": [], + "source": [ + "# Visualize the raw dataset\n", + "train_raw_filepath = DATASET_DIR / \"train.json\"\n", + "!head $train_raw_filepath" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i3jsk2HCMSU5" + }, + "source": [ + "## Manifest Processing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N8WuAGJsMHRn" + }, + "source": [ + "The downloaded manifest is formatted for TTS training, which contains metadata such as text and speaker.\n", + "\n", + "For codec training we need `audio_filepath`. The `audio_filepath` field can either be an *absolute path*, or a *relative path* with the root directory provided as an argument to each script. Here we use relative paths.\n", + "\n", + "If you include `duration` the training script will automatically calculate the total size of each dataset used, and can be useful for filtering based on utterance length." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zoCRrKQ20VZP" + }, + "outputs": [], + "source": [ + "def update_manifest(data_type):\n", + " input_filepath = DATASET_DIR / f\"{data_type}.json\"\n", + " output_filepath = DATASET_DIR / f\"{data_type}_raw.json\"\n", + "\n", + " entries = read_manifest(input_filepath)\n", + " new_entries = []\n", + " for entry in entries:\n", + " # Provide relative path instead of absolute path\n", + " audio_filepath = entry[\"audio_filepath\"].replace(\"audio/\", \"\")\n", + " duration = round(entry[\"duration\"], 2)\n", + " new_entry = {\n", + " \"audio_filepath\": audio_filepath,\n", + " \"duration\": duration\n", + " }\n", + " new_entries.append(new_entry)\n", + "\n", + " write_manifest(output_path=output_filepath, target_manifest=new_entries, ensure_ascii=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PaCc3GCG1UbH" + }, + "outputs": [], + "source": [ + "update_manifest(\"dev\")\n", + "update_manifest(\"train\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bVLIB3Ip1Aqn" + }, + "outputs": [], + "source": [ + "# Visualize updated 'audio_filepath' field.\n", + "train_filepath = DATASET_DIR / \"train_raw.json\"\n", + "!head $train_filepath" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "alrRDWio41qi" + }, + "source": [ + "## Audio Preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4WfEaMwpUsFt" + }, + "source": [ + "Next we process the audio data using [preprocess_audio.py](https://github.com/NVIDIA/NeMo/blob/main/scripts/dataset_processing/tts/preprocess_audio.py).\n", + "\n", + "During this step we can apply the following transformations:\n", + "\n", + "1. Resample the audio from 48khz to the target sample rate for codec training.\n", + "2. Remove long silence from the beginning and end of each audio file. This can be done using an *energy* based approach which will work on clean audio, or using *voice activity detection (VAD)* which is slower but also works on audio with background or static noise (eg. from a microphone). Here we suggest VAD because some audio in VCTK has background noise." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WEvIefjnd7AG" + }, + "outputs": [], + "source": [ + "import IPython.display as ipd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-qEuCH8S4vFP" + }, + "outputs": [], + "source": [ + "# Python wrapper to invoke the given bash script with the given input args\n", + "def run_script(script, args):\n", + " args = ' \\\\'.join(args)\n", + " cmd = f\"python {script} \\\\{args}\"\n", + "\n", + " print(cmd.replace(\" \\\\\", \"\\n\"))\n", + " print()\n", + " !$cmd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0kQ1UDnGfdX6" + }, + "outputs": [], + "source": [ + "audio_preprocessing_script = NEMO_SCRIPT_DIR / \"preprocess_audio.py\"\n", + "\n", + "# Directory with raw audio data\n", + "input_audio_dir = DATASET_DIR / \"audio\"\n", + "# Directory to write preprocessed audio to\n", + "output_audio_dir = DATASET_DIR / \"audio_preprocessed\"\n", + "# Whether to overwrite existing audio, if it exists in the output directory\n", + "overwrite_audio = True\n", + "# Whether to overwrite output manifest, if it exists\n", + "overwrite_manifest = True\n", + "# Number of threads to parallelize audio processing across\n", + "num_workers = 4\n", + "# Format of output audio files. Use \"flac\" to compress to a smaller file size.\n", + "output_format = \"flac\"\n", + "# Method for silence trimming. Can use \"energy.yaml\" or \"vad.yaml\".\n", + "trim_config_path = NEMO_CONFIG_DIR / \"trim\" / \"vad.yaml\"\n", + "\n", + "def preprocess_audio(data_type):\n", + " input_filepath = DATASET_DIR / f\"{data_type}_raw.json\"\n", + " output_filepath = DATASET_DIR / f\"{data_type}_manifest.json\"\n", + "\n", + " args = [\n", + " f\"--input_manifest={input_filepath}\",\n", + " f\"--output_manifest={output_filepath}\",\n", + " f\"--input_audio_dir={input_audio_dir}\",\n", + " f\"--output_audio_dir={output_audio_dir}\",\n", + " f\"--num_workers={num_workers}\",\n", + " f\"--output_sample_rate={SAMPLE_RATE}\",\n", + " f\"--output_format={output_format}\",\n", + " f\"--trim_config_path={trim_config_path}\"\n", + " ]\n", + " if overwrite_manifest:\n", + " args.append(\"--overwrite_manifest\")\n", + " if overwrite_audio:\n", + " args.append(\"--overwrite_audio\")\n", + "\n", + " run_script(audio_preprocessing_script, args)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ai0zbXSOriuY" + }, + "outputs": [], + "source": [ + "preprocess_audio(\"dev\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NUKnidQYfgDo" + }, + "outputs": [], + "source": [ + "preprocess_audio(\"train\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x2yhJtsj2lDR" + }, + "source": [ + "Before we proceed, it is important to verify that the audio processing works as expected. Let's listen to an audio file before and after processing.\n", + "\n", + "Note that the processed audio is shorter because we trimmed the leading and trailing silence." 
+ ] + }, + { + "cell_type": "code", + "source": [ + "!ls $processed_audio_filepath" + ], + "metadata": { + "id": "AfdHUHAWuF-G" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_fM3GwJxkjOA" + }, + "outputs": [], + "source": [ + "audio_file = \"p228_009.wav\"\n", + "audio_filepath = input_audio_dir / audio_file\n", + "processed_audio_filepath = output_audio_dir / audio_file.replace(\".wav\", \".flac\")\n", + "\n", + "print(\"Original audio.\")\n", + "ipd.display(ipd.Audio(audio_filepath, rate=SAMPLE_RATE))\n", + "\n", + "print(\"Processed audio.\")\n", + "ipd.display(ipd.Audio(processed_audio_filepath, rate=SAMPLE_RATE))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oRO842MUyODC" + }, + "source": [ + "# Audio Codec Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E4wUKYOfH8ax" + }, + "source": [ + "Here we show how to train an audio codec model from scratch. Instructions and checkpoints for fine-tuning will be provided later.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pqfl9jAYMJob" + }, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "from omegaconf import OmegaConf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jK2rr-Kr6Qg8" + }, + "outputs": [], + "source": [ + "dataset_name = \"vctk\"\n", + "audio_dir = DATASET_DIR / \"audio_preprocessed\"\n", + "train_manifest_filepath = DATASET_DIR / \"train_manifest.json\"\n", + "dev_manifest_filepath = DATASET_DIR / \"dev_manifest.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Vr4D-NB-yQx8" + }, + "outputs": [], + "source": [ + "audio_codec_training_script = NEMO_EXAMPLES_DIR / \"audio_codec.py\"\n", + "\n", + "# The total number of training steps will be (epochs * steps_per_epoch)\n", + "epochs = 10\n", + "steps_per_epoch = 10\n", + "\n", + "# Name of the experiment that will determine where it is saved locally and in TensorBoard and WandB\n", + "run_id = \"test_run\"\n", + "exp_dir = ROOT_DIR / \"exps\"\n", + "codec_exp_output_dir = exp_dir / MODEL_NAME / run_id\n", + "# Directory where predicted audio will be stored periodically throughout training\n", + "codec_log_dir = codec_exp_output_dir / \"logs\"\n", + "# Optionally log visualization of learned codes.\n", + "log_dequantized = True\n", + "# Optionally log predicted audio and other artifacts to WandB\n", + "log_to_wandb = False\n", + "# Optionally log predicted audio and other artifacts to Tensorboard\n", + "log_to_tensorboard = False\n", + "\n", + "if torch.cuda.is_available():\n", + " accelerator=\"gpu\"\n", + " batch_size = 4\n", + "else:\n", + " accelerator=\"cpu\"\n", + " batch_size = 2\n", + "\n", + "args = [\n", + " f\"--config-path={CONFIG_DIR}\",\n", + " f\"--config-name={CONFIG_FILENAME}\",\n", + " f\"max_epochs={epochs}\",\n", + " f\"weighted_sampling_steps_per_epoch={steps_per_epoch}\",\n", + " f\"batch_size={batch_size}\",\n", + " f\"log_dir={codec_log_dir}\",\n", + " f\"exp_manager.exp_dir={exp_dir}\",\n", + " f\"+exp_manager.version={run_id}\",\n", + " f\"model.log_config.log_wandb={log_to_wandb}\",\n", + " f\"model.log_config.log_tensorboard={log_to_tensorboard}\",\n", + " f\"model.log_config.generators.0.log_dequantized={log_dequantized}\",\n", + " f\"trainer.accelerator={accelerator}\",\n", + " f\"+train_ds_meta.{dataset_name}.manifest_path={train_manifest_filepath}\",\n", + " 
f\"+train_ds_meta.{dataset_name}.audio_dir={audio_dir}\",\n", + " f\"+val_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}\",\n", + " f\"+val_ds_meta.{dataset_name}.audio_dir={audio_dir}\",\n", + " f\"+log_ds_meta.{dataset_name}.manifest_path={dev_manifest_filepath}\",\n", + " f\"+log_ds_meta.{dataset_name}.audio_dir={audio_dir}\"\n", + "]\n", + "\n", + "if MODEL_CHECKPOINT_PATH is not None:\n", + " args.append(f\"+init_from_nemo_model={MODEL_CHECKPOINT_PATH}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Bn8lQG0PxWGi" + }, + "outputs": [], + "source": [ + "# If an error occurs, log the entire stacktrace.\n", + "os.environ[\"HYDRA_FULL_ERROR\"] = \"1\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yUxFCNrE3Ywi" + }, + "outputs": [], + "source": [ + "# Do the model training. For some configurations this step might hang when using CPU.\n", + "run_script(audio_codec_training_script, args)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BBPIpS-lL6z9" + }, + "source": [ + "During training, the model will automatically save predictions for all audio files specified in the `log_ds_meta` manifest." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rSFOm1Sg46Lh" + }, + "outputs": [], + "source": [ + "codec_log_epoch_dir = codec_log_dir / \"epoch_10\" / dataset_name\n", + "!ls $codec_log_epoch_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oCJs7oCLMIjD" + }, + "source": [ + "This makes it easy to listen to the audio to determine how well the model is performing. We can decide to stop training when either:\n", + "\n", + "* The predicted audio sounds almost identical to the original audio.\n", + "* The predicted audio stops improving in between epochs.\n", + "\n", + "**Note that when training from scratch, the dataset in this tutorial is too small to get good audio quality.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "G6k4ymzfJ5Y6" + }, + "outputs": [], + "source": [ + "audio_filepath_ground_truth = output_audio_dir / \"p228_009.flac\"\n", + "audio_filepath_reconstructed = codec_log_epoch_dir / \"p228_009_audio_out.wav\"\n", + "\n", + "print(\"Ground truth audio.\")\n", + "ipd.display(ipd.Audio(audio_filepath_ground_truth, rate=SAMPLE_RATE))\n", + "\n", + "print(\"Reconstructed audio.\")\n", + "ipd.display(ipd.Audio(audio_filepath_reconstructed, rate=SAMPLE_RATE))\n", + "\n", + "dequantized_filepath = codec_log_epoch_dir / \"p228_009_dequantized.png\"\n", + "ipd.Image(dequantized_filepath)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Related Information" + ], + "metadata": { + "id": "rynZYwg2VP5d" + } + }, + { + "cell_type": "markdown", + "source": [ + "To learn more about audio codec models in NeMo, look at our [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/tts/models.html#codecs).\n", + "\n", + "For more information on how to download and run pre-trained audio codec models, visit [NGC](https://catalog.ngc.nvidia.com/models?filters=&orderBy=scoreDESC&query=codec)." + ], + "metadata": { + "id": "_LtyHHuLkNDv" + } + }, + { + "cell_type": "markdown", + "source": [ + "# References" + ], + "metadata": { + "id": "LeqV3VvJVOb-" + } + }, + { + "cell_type": "markdown", + "source": [ + "1. [EnCodec](https://arxiv.org/abs/2210.13438)\n", + "2. [Finite Scalar Quantization (FSQ)](https://arxiv.org/abs/2309.15505)\n", + "3. 
[HiFi-GAN](https://arxiv.org/abs/2010.05646)\n", + "4. [SEANet](https://arxiv.org/abs/2009.02095)" + ], + "metadata": { + "id": "Rvu4w2x_3RSY" + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 8005fb2f7ef4936e090f93f5f80b0e76bfa18e78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Thu, 2 May 2024 15:31:45 -0400 Subject: [PATCH 024/178] Improved `shard_id` parsing in `LazyNemoTarredIterator`, enables AIS dataloading (#9077) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * More permissive shard_id parsing, enables AIS dataloading Signed-off-by: Piotr Żelasko * Fix to shard id discovery Signed-off-by: Piotr Żelasko * More informative assertion errors Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko --- .../common/data/lhotse/nemo_adapters.py | 59 +++++++++++++------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 02b3e1f4edda..b8769b041b4f 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -14,7 +14,6 @@ import random import re -import secrets import tarfile from io import BytesIO from pathlib import Path @@ -147,9 +146,20 @@ class LazyNeMoTarredIterator: Args ``manifest_path`` and ``tar_paths`` can be either a path/string to a single file, or a string in NeMo format that indicates multiple paths (e.g. "[[data/bucket0/tarred_audio_paths.json],[data/bucket1/...]]"). + We discover shard ids from sharded tar and json files by parsing the input specifier/path and + searching for the following pattern: ``(manifest|audio)[^/]*_(\d+)[^/]*\.(json|tar)``. + It allows filenames such as ``manifest_0.json``, ``manifest_0_normalized.json``, ``manifest_normalized_0.json``, + ``manifest_0.jsonl.gz``, etc. (anologusly the same applies to tar files). + + We also support generalized input specifiers that imitate webdataset's pipes (also very similar to Kaldi's pipes). + These are arbitrary shell commands to be lazily executed which yield manifest or tar audio contents. + For example, ``tar_paths`` can be set to ``pipe:ais get ais://my-bucket/audio_{0..127}.tar -`` + to indicate that we want to read tarred audio data from shards on an AIStore bucket. + This can be used for other cloud storage APIs such as S3, GCS, etc. + The same mechanism applies to ``manifest_path``. The ``shard_seed`` argument is used to seed the RNG shuffling the shards. - By default it's ``trng`` which samples a seed number from OS-provided TRNG (see Python ``secrets`` module). + By default, it's ``trng`` which samples a seed number from OS-provided TRNG (see Python ``secrets`` module). Seed is resolved lazily so that every dataloading worker may sample a different one. 
Override with an integer value for deterministic behaviour and consult Lhotse documentation for details: https://lhotse.readthedocs.io/en/latest/datasets.html#handling-random-seeds @@ -172,30 +182,36 @@ def __init__( text_field: str = "text", lang_field: str = "lang", ) -> None: - def strip_pipe(p): - if isinstance(p, str): - if p.startswith("pipe:"): - p = p[5:] - return Path(p) - return p - self.shard_id_to_manifest: dict[int, Iterable[dict]] self.paths = expand_sharded_filepaths(manifest_path) if len(self.paths) == 1: self.source = LazyJsonlIterator(self.paths[0]) self.shard_id_to_manifest = groupby("shard_id", self.source) else: - pattern = re.compile(r".+_(\d+)\.jsonl?(?:.gz)?") + json_pattern = re.compile(r"manifest[^/]*_(\d+)[^/]*\.json") shard_ids = [] for p in self.paths: - m = pattern.match(p) - assert m is not None, f"Cannot determine shard_id from manifest path: {p}" + m = json_pattern.search(p) + assert m is not None, ( + f"Cannot determine shard_id from manifest input specified: " + f"we searched with regex '{json_pattern.pattern}' in input '{p}'" + ) shard_ids.append(int(m.group(1))) self.shard_id_to_manifest = {sid: LazyJsonlIterator(p) for sid, p in zip(shard_ids, self.paths)} self.source = LazyIteratorChain(*self.shard_id_to_manifest.values()) - tar_paths = expand_sharded_filepaths(tar_paths) - self.shard_id_to_tar_path: dict[int, str] = {int(strip_pipe(p).stem.split("_")[1]): p for p in tar_paths} + self.tar_paths = expand_sharded_filepaths(tar_paths) + tar_pattern = re.compile(r"audio[^/]*_(\d+)[^/]*\.tar") + shard_ids = [] + for p in self.tar_paths: + m = tar_pattern.search(p) + assert m is not None, ( + f"Cannot determine shard_id from tar input specifier: " + f"we searched with regex '{tar_pattern.pattern}' in input '{p}'" + ) + shard_ids.append(int(m.group(1))) + self.shard_id_to_tar_path = dict(zip(shard_ids, self.tar_paths)) + self.shuffle_shards = shuffle_shards self.shard_seed = shard_seed self.text_field = text_field @@ -225,8 +241,11 @@ def _validate(self) -> None: shard_ids_tars = set(self.shard_id_to_tar_path) shard_ids_manifest = set(self.shard_id_to_manifest) assert shard_ids_tars == shard_ids_manifest, ( - f"Mismatch between shard IDs discovered from tar files ({len(shard_ids_tars)=}) and " - f"JSON manifest ({len(shard_ids_manifest)=}): {shard_ids_tars - shard_ids_manifest=}" + f"Mismatch between shard IDs. Details:\n" + f"* JSON manifest(s) {self.paths}\n" + f"* Tar files: {self.tar_paths}\n" + f"* JSON manifest(s) indicate(s) IDs: {sorted(shard_ids_manifest)}\n" + f"* Tar path(s) indicate(s) IDs: {sorted(shard_ids_tars)}\n" ) @property @@ -245,9 +264,11 @@ def __iter__(self) -> Generator[Cut, None, None]: tar_path = self.shard_id_to_tar_path[sid] with tarfile.open(fileobj=open_best(tar_path, mode="rb"), mode="r|*") as tar: for data, tar_info in zip(shard_manifest, tar): - assert ( - data["audio_filepath"] == tar_info.name - ), f"Mismatched JSON manifest and tar file. {data['audio_filepath']=} != {tar_info.name=}" + manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0] + assert data["audio_filepath"] == tar_info.name, ( + f"Mismatched entry between JSON manifest ('{manifest_path}') and tar file ('{tar_path}'). 
" + f"Conflicting audio file names are JSON='{data['audio_filepath']}' and TAR='{tar_info.name}'" + ) raw_audio = tar.extractfile(tar_info).read() # Note: Lhotse has a Recording.from_bytes() utility that we won't use here because # the profiling indicated significant overhead in torchaudio ffmpeg integration From e16d06999bd088eb2fa1b1787628fca9929613bf Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Thu, 2 May 2024 23:45:53 +0200 Subject: [PATCH 025/178] [NeMo-UX] Add mistral-7b model (#9066) * Adding MegatronParallel * Move over _strategy_liMegatronCheckpointIO * Adding GPTModel & MockDataModule * Adding mixed-precision to NeMo * Fix import * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Adding MegatronParallel * Move over _strategy_liMegatronCheckpointIO * Adding GPTModel & MockDataModule * Add nemo.io to MegatronStrategy * Move to cloudpickle * Adding Mistral7B model * Fix small bug inside state-transform * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert unintended changes Signed-off-by: Chen Cui * clean up code and reinstate mix precision tests Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * clean up Signed-off-by: Chen Cui * use cpu for unit test Signed-off-by: Chen Cui * clean up Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix test Signed-off-by: Chen Cui * mistral requires hf login so use a toy model for now Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert accidental change Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Chen Cui --- nemo/io/__init__.py | 17 +- nemo/io/api.py | 171 +++++++++- nemo/io/connector.py | 179 +++++++++++ nemo/io/mixin.py | 184 ++++++++++- nemo/io/state.py | 403 ++++++++++++++++++++++++ nemo/llm/__init__.py | 12 +- nemo/llm/gpt/model/__init__.py | 11 +- nemo/llm/gpt/model/base.py | 7 +- nemo/llm/gpt/model/mistral_7b.py | 263 ++++++++++++++++ requirements/requirements_lightning.txt | 1 + tests/io/test_state.py | 233 ++++++++++++++ 11 files changed, 1472 insertions(+), 9 deletions(-) create mode 100644 nemo/io/connector.py create mode 100644 nemo/io/state.py create mode 100644 nemo/llm/gpt/model/mistral_7b.py create mode 100644 tests/io/test_state.py diff --git a/nemo/io/__init__.py b/nemo/io/__init__.py index 5b1d48768848..1b541ff7ba34 100644 --- a/nemo/io/__init__.py +++ b/nemo/io/__init__.py @@ -1,14 +1,25 @@ -from nemo.io.api import load, load_ckpt +from nemo.io.api import export_ckpt, import_ckpt, load, load_ckpt, model_exporter, model_importer from nemo.io.capture import reinit -from nemo.io.mixin import IOMixin +from nemo.io.connector import Connector, ModelConnector +from nemo.io.mixin import ConnectorMixin, IOMixin from nemo.io.pl import TrainerCheckpoint, is_distributed_ckpt - +from nemo.io.state import TransformCTX, apply_transforms, state_transform __all__ = [ + "apply_transforms", + "Connector", + "ConnectorMixin", "IOMixin", + "import_ckpt", "is_distributed_ckpt", + "export_ckpt", "load", "load_ckpt", + "ModelConnector", + "model_importer", + "model_exporter", 'reinit', + "state_transform", "TrainerCheckpoint", + "TransformCTX", ] diff --git a/nemo/io/api.py 
b/nemo/io/api.py index f7de36cb9545..c8fe3c04a811 100644 --- a/nemo/io/api.py +++ b/nemo/io/api.py @@ -1,9 +1,11 @@ import pickle from pathlib import Path -from typing import Any, Type, TypeVar +from typing import Any, Callable, Optional, Type, TypeVar import fiddle as fdl +import pytorch_lightning as pl +from nemo.io.mixin import ConnectorMixin, ConnT, ModelConnector from nemo.io.pl import TrainerCheckpoint CkptType = TypeVar("CkptType") @@ -60,3 +62,170 @@ def load_ckpt(path: Path) -> TrainerCheckpoint: checkpoint: TrainerCheckpoint = load_ckpt("/path/to/checkpoint") """ return load(path, output_type=TrainerCheckpoint) + + +def model_importer( + target: Type[ConnectorMixin], ext: str, default_path: Optional[str] = None +) -> Callable[[Type[ConnT]], Type[ConnT]]: + """ + Registers an importer for a model with a specified file extension and an optional default path. + + Args: + target (Type[ConnectorMixin]): The model class to which the importer will be attached. + ext (str): The file extension associated with the model files to be imported. + default_path (Optional[str]): The default path where the model files are located, if any. + + Returns + ------- + Callable[[Type[ConnT]], Type[ConnT]]: A decorator function that registers the importer + to the model class. + + Example: + @model_importer(MyModel, "hf", default_path="path/to/default") + class MyModelHfImporter(io.ModelConnector): + ... + """ + return target.register_importer(ext, default_path=default_path) + + +def model_exporter( + target: Type[ConnectorMixin], ext: str, default_path: Optional[str] = None +) -> Callable[[Type[ConnT]], Type[ConnT]]: + """ + Registers an exporter for a model with a specified file extension and an optional default path. + + Args: + target (Type[ConnectorMixin]): The model class to which the exporter will be attached. + ext (str): The file extension associated with the model files to be exported. + default_path (Optional[str]): The default path where the model files will be saved, if any. + + Returns + ------- + Callable[[Type[ConnT]], Type[ConnT]]: A decorator function that registers the exporter + to the model class. + + Example: + @model_exporter(MyModel, "hf", default_path="path/to/default") + class MyModelHFExporter(io.ModelConnector): + ... + """ + return target.register_exporter(ext, default_path=default_path) + + +def import_ckpt( + model: pl.LightningModule, source: str, output_path: Optional[Path] = None, overwrite: bool = False +) -> Path: + """ + Imports a checkpoint into a model using the model's associated importer, typically for + the purpose of fine-tuning a community model trained in an external framework, such as + Hugging Face. This function leverages the ConnectorMixin interface to integrate external + checkpoint data seamlessly into the specified model instance. + + The importer component of the model reads the checkpoint data from the specified source + and transforms it into the right format. This is particularly useful for adapting + models that have been pre-trained in different environments or frameworks to be fine-tuned + or further developed within the current system. The function allows for specifying an output + path for the imported checkpoint; if not provided, the importer's default path will be used. + The 'overwrite' parameter enables the replacement of existing data at the output path, which + is useful when updating models with new data and discarding old checkpoint files. 
+ + For instance, using `import_ckpt(Mistral7BModel(), "hf")` initiates the import process + by searching for a registered model importer tagged with "hf". In NeMo, `HFMistral7BImporter` + is registered under this tag via: + `@io.model_importer(Mistral7BModel, "hf", default_path="mistralai/Mistral-7B-v0.1")`. + This links `Mistral7BModel` to `HFMistral7BImporter`, designed for HuggingFace checkpoints. + The importer then processes and integrates these checkpoints into `Mistral7BModel` for further + fine-tuning. + + Args: + model (pl.LightningModule): The model into which the checkpoint will be imported. + This model must implement the ConnectorMixin, which includes the necessary + importer method for checkpoint integration. + source (str): The source from which the checkpoint will be imported. This can be + a file path, URL, or any other string identifier that the model's importer + can recognize. + output_path (Optional[Path]): The path where the imported checkpoint will be stored. + If not specified, the importer's default path is used. + overwrite (bool): If set to True, existing files at the output path will be overwritten. + This is useful for model updates where retaining old checkpoint files is not required. + + Returns + ------- + Path: The path where the checkpoint has been saved after import. This path is determined + by the importer, based on the provided output_path and its internal logic. + + Raises + ------ + ValueError: If the model does not implement ConnectorMixin, indicating a lack of + necessary importer functionality. + + Example: + model = Mistral7BModel() + imported_path = import_ckpt(model, "hf") + """ + if not isinstance(model, ConnectorMixin): + raise ValueError("Model must be an instance of ConnectorMixin") + + importer: ModelConnector = model.importer(source) + return importer(overwrite=overwrite, output_path=output_path) + + +def load_connector_from_trainer_ckpt(path: Path, target: str) -> ModelConnector: + model: pl.LightningModule = load_ckpt(path).model + + if not isinstance(model, ConnectorMixin): + raise ValueError("Model must be an instance of ConnectorMixin") + + return model.exporter(target, path) + + +def export_ckpt( + path: Path, + target: str, + output_path: Optional[Path] = None, + overwrite: bool = False, + load_connector: Callable[[Path, str], ModelConnector] = load_connector_from_trainer_ckpt, +) -> Path: + """ + Exports a checkpoint from a model using the model's associated exporter, typically for + the purpose of sharing a model that has been fine-tuned or customized within NeMo. + This function leverages the ConnectorMixin interface to seamlessly integrate + the model's state into an external checkpoint format. + + The exporter component of the model reads the model's state from the specified path and + exports it into the format specified by the 'target' identifier. This is particularly + useful for adapting models that have been developed or fine-tuned within the current system + to be compatible with other environments or frameworks. The function allows for specifying + an output path for the exported checkpoint; if not provided, the exporter's default path + will be used. The 'overwrite' parameter enables the replacement of existing data at the + output path, which is useful when updating models with new data and discarding old checkpoint + files. + + Args: + path (Path): The path to the model's checkpoint file from which data will be exported. + target (str): The identifier for the exporter that defines the format of the export. 
+ output_path (Optional[Path]): The path where the exported checkpoint will be saved. + If not specified, the exporter's default path is used. + overwrite (bool): If set to True, existing files at the output path will be overwritten. + This is useful for model updates where retaining old checkpoint files is not required. + load_connector (Callable[[Path, str], ModelConnector]): A function to load the appropriate + exporter based on the model and target format. Defaults to `load_connector_from_trainer_ckpt`. + + Returns + ------- + Path: The path where the checkpoint has been saved after export. This path is determined + by the exporter, based on the provided output_path and its internal logic. + + Raises + ------ + ValueError: If the model does not implement ConnectorMixin, indicating a lack of + necessary exporter functionality. + + Example: + nemo_ckpt_path = Path("/path/to/model.ckpt") + export_path = export_ckpt(nemo_ckpt_path, "hf") + """ + exporter: ModelConnector = load_connector(path, target) + _output_path = output_path or Path(path) / target + + return exporter(overwrite=overwrite, output_path=_output_path) diff --git a/nemo/io/connector.py b/nemo/io/connector.py new file mode 100644 index 000000000000..bf5f88f95992 --- /dev/null +++ b/nemo/io/connector.py @@ -0,0 +1,179 @@ +import os +import shutil +from pathlib import Path, PosixPath, WindowsPath +from typing import Generic, Optional, Tuple, TypeVar + +import pytorch_lightning as pl + +# Dynamically inherit from the correct Path subclass based on the operating system. +if os.name == 'nt': + BasePath = WindowsPath +else: + BasePath = PosixPath + + +SourceT = TypeVar("SourceT") +TargetT = TypeVar("TargetT") + + +class Connector(BasePath, Generic[SourceT, TargetT]): + """ + A generic connector class that provides a framework for transforming a source type (SourceT) + to a target type (TargetT) while handling file paths based on the operating system. + + Attributes + ---------- + default_path (Optional[Path]): A default path used when no path is explicitly provided. + + Methods + ------- + init() -> TargetT: + Should be implemented to initialize the target type from the source type. + + apply(output_path: Path) -> Path: + Should be implemented to apply the transformation and save the result at the output path. + + __new__(cls, *args, **kwargs) -> 'Connector': + Creates a new instance of the connector, using default_path if no path is provided. + + __call__(output_path: Optional[Path] = None, overwrite: bool = False) -> Path: + Processes the transformation and handles file operations like overwriting. + + local_path(base_path: Optional[Path] = None) -> Path: + Computes the local path for storage based on a base path or a default cache home. + + is_in_cache(base_path: Optional[Path] = None) -> bool: + Checks if the transformed data is already cached at the specified base path. 
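+
+    Example
+    -------
+    A minimal sketch of a concrete connector and how it is invoked; the class name and
+    paths below are illustrative placeholders, not part of NeMo:
+
+        class TextFileConnector(Connector):
+            default_path = "/tmp/default_source.txt"
+
+            def apply(self, output_path: Path) -> Path:
+                # `self` behaves like a Path pointing at the source; write the
+                # transformed result to `output_path`.
+                output_path.parent.mkdir(parents=True, exist_ok=True)
+                output_path.write_text(self.read_text().upper())
+                return output_path
+
+        # Calling the connector runs `apply` unless the output already exists; omitting
+        # `output_path` falls back to the cached location computed by `local_path()`.
+        converted = TextFileConnector("/tmp/source.txt")(output_path=Path("/tmp/upper.txt"))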
+ """ + + default_path = None + + def init(self) -> TargetT: + raise NotImplementedError() + + def apply(self, output_path: Path) -> Path: + raise NotImplementedError() + + def __new__(cls, *args, **kwargs): + if cls.default_path is not None and not args and 'path' not in kwargs: + # If default_path is set and no arguments are provided, use default_path as the argument + return super().__new__(cls, cls.default_path) + + return super().__new__(cls, *args, **kwargs) + + def __call__(self, output_path: Optional[Path] = None, overwrite: bool = False) -> Path: + _output_path = output_path or self.local_path() + + if overwrite and _output_path.exists(): + shutil.rmtree(_output_path) + + if not _output_path.exists(): + to_return = self.apply(_output_path) + _output_path = to_return or _output_path + + return _output_path + + def local_path(self, base_path: Optional[Path] = None) -> Path: + if base_path: + _base = base_path + else: + from nemo.lightning.base import NEMO_CACHE_HOME + + _base = Path(NEMO_CACHE_HOME) + + return _base / str(self).replace("://", "/") + + def is_in_cache(self, base_path: Optional[Path] = None) -> bool: + return self.local_path(base_path=base_path).exists() + + +class ModelConnector(Connector, Generic[SourceT, TargetT]): + """ + A specialized connector that extends the generic Connector to handle model-specific operations + such as setup, save, and load using the Lightning framework. + + Methods + ------- + nemo_setup(model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: + Sets up the model and trainer using a specified strategy, preparing it for training or inference. + + nemo_save(output_path: Path, trainer: pl.Trainer): + Saves the model's state to the specified path using the trainer's current strategy. + + nemo_load(path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True) -> Tuple[Any, pl.Trainer]: + Loads a model from the specified path, optionally using a CPU-focused strategy, and returns the model and trainer. + """ + + def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = None) -> pl.Trainer: + """ + Sets up the model and trainer using a specified strategy, preparing it for training or inference. + + Args: + model (pl.LightningModule): The model to be set up. + trainer (Optional[pl.Trainer]): The trainer to be used, if not provided a new one will be created. + + Returns + ------- + pl.Trainer: The trainer configured with the model and strategy. + """ + from nemo.lightning import MegatronStrategy, Trainer + + _trainer = trainer or Trainer(devices=1, accelerator="cpu", strategy=MegatronStrategy()) + + _trainer.strategy.connect(model) + _trainer.strategy.setup_environment() + + if not model.state_dict(): + _trainer.strategy.lazy_init = True + with _trainer.init_module(): + model.configure_model() + + return _trainer + + def nemo_save(self, output_path: Path, trainer: pl.Trainer) -> None: + """ + Saves the model's state to the specified path using the trainer's current strategy. + + Args: + output_path (Path): The path where the model checkpoint will be saved. + trainer (pl.Trainer): The trainer with the strategy to save the model. + """ + trainer.strategy.setup(trainer) + trainer.save_checkpoint(output_path) + + def nemo_load( + self, path: Path, trainer: Optional[pl.Trainer] = None, cpu: bool = True + ) -> Tuple[pl.LightningModule, pl.Trainer]: + """ + Loads a model from the specified path. + + Args: + path (Path): The path from which the model will be loaded. 
+ trainer (Optional[pl.Trainer]): The trainer to be used, if not provided a new one will be created. + cpu (bool): If True, the model will be loaded with a CPU-focused strategy. + + Returns + ------- + Tuple[pl.LightningModule, pl.Trainer]: The loaded model and the trainer configured with the model. + """ + from nemo.io.api import load_ckpt + from nemo.lightning import MegatronStrategy, Trainer, _strategy_lib + + model = load_ckpt(path).model + _trainer = trainer or Trainer(devices=1, accelerator="cpu" if cpu else "gpu", strategy=MegatronStrategy()) + + _trainer.strategy.connect(model) + _trainer.strategy.setup_environment() + # TODO: Fix cpu initialization + if not model.state_dict(): + if cpu: + # TODO: Make this more generic + with _strategy_lib.megatron_cpu_init_context(model.config): + model.configure_model() + else: + model.configure_model() + + _trainer.strategy.setup(_trainer) + _trainer.strategy.load_checkpoint(path) + + return model, _trainer diff --git a/nemo/io/mixin.py b/nemo/io/mixin.py index d09c456f7957..bba6677b452b 100644 --- a/nemo/io/mixin.py +++ b/nemo/io/mixin.py @@ -2,13 +2,16 @@ import inspect from dataclasses import is_dataclass from pathlib import Path -from typing import Any, Dict +from typing import Any, Callable, Dict, Optional, Type, TypeVar, Union import fiddle as fdl from cloudpickle import dump from typing_extensions import Self from nemo.io.capture import IOProtocol +from nemo.io.connector import ModelConnector + +ConnT = TypeVar('ConnT', bound=ModelConnector) class IOMixin: @@ -137,3 +140,182 @@ def io_dump(self, output: Path): config_path = Path(output) / "io.pkl" with open(config_path, "wb") as f: dump(self.__io__, f) + + +class ConnectorMixin: + """ + A mixin class that provides methods to register and retrieve model connectors for importing + and exporting models. This class supports dynamic registration of connectors based on file + extensions, which facilitates the customization and extension of model serialization and + deserialization processes. + + Attributes + ---------- + _IMPORTERS (Dict[str, Type[ModelConnector]]): A dictionary mapping file extensions to + model connector classes that handle the import process. + _EXPORTERS (Dict[str, Type[ModelConnector]]): A dictionary mapping file extensions to + model connector classes that handle the export process. + """ + + _IMPORTERS: Dict[str, Type[ModelConnector]] = {} + _EXPORTERS: Dict[str, Type[ModelConnector]] = {} + + @classmethod + def import_from(cls, path: str) -> Self: + """ + Creates an instance of a model by using the appropriate importer based on the file + extension of the provided path. + + Args: + path (str): The path to the model file to be imported. + + Example: + from nemo import llm + model = llm.Mistral7BModel.import_from("hf") + + Returns + ------- + Self: An instance of the model initialized from the imported data. + """ + output = cls._get_connector(path).init() + output.ckpt_path = output.import_ckpt_path(path) + + return output + + @classmethod + def register_importer(cls, ext: str, default_path: Optional[str] = None) -> Callable[[Type[ConnT]], Type[ConnT]]: + """ + A class method decorator to register a model connector as an importer for a specific file + extension. + + Args: + ext (str): The file extension to associate with the model connector. + default_path (Optional[str]): The default path to use if no path is specified during import. + + Returns + ------- + Callable[[Type[ConnT]], Type[ConnT]]: The decorator that registers the model connector. 
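+
+        Example:
+            A minimal sketch mirroring the ``model_importer`` example above; ``MyModel``
+            and ``MyModelHFImporter`` are placeholder names:
+
+            @MyModel.register_importer("hf", default_path="path/to/default")
+            class MyModelHFImporter(io.ModelConnector):
+                ...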
+ """ + + def decorator(connector: Type[ConnT]) -> Type[ConnT]: + cls._IMPORTERS[ext] = connector + if default_path: + connector.default_path = default_path + return connector + + return decorator + + @classmethod + def register_exporter(cls, ext: str, default_path: Optional[str] = None) -> Callable[[Type[ConnT]], Type[ConnT]]: + """ + A class method decorator to register a model connector as an exporter for a specific file + extension. + + Args: + ext (str): The file extension to associate with the model connector. + default_path (Optional[str]): The default path to use if no path is specified during export. + + Returns + ------- + Callable[[Type[ConnT]], Type[ConnT]]: The decorator that registers the model connector. + """ + + def decorator(connector: Type[ConnT]) -> Type[ConnT]: + cls._EXPORTERS[ext] = connector + if default_path: + connector.default_path = default_path + return connector + + return decorator + + @classmethod + def importer(cls, path: str) -> ModelConnector: + """ + Retrieves the appropriate model connector for importing based on the extension of the + provided path. + + Args: + path (str): The path to the model file to be imported. + + Returns + ------- + ModelConnector: The model connector instance capable of handling the import. + """ + return cls._get_connector(path, importer=True) + + @classmethod + def exporter(cls, ext: str, path: Union[str, Path]) -> ModelConnector: + """ + Retrieves the appropriate model connector for exporting based on the extension. + + Args: + ext (str): The file extension associated with the model connector. + path (Union[str, Path]): The path where the model will be exported. + + Returns + ------- + ModelConnector: The model connector instance capable of handling the export. + """ + return cls._get_connector(ext, path, importer=False) + + def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Path] = None) -> Path: + """ + Imports a checkpoint from a specified path, potentially overwriting existing files. + + Args: + path (str): The path to the checkpoint file to be imported. + overwrite (bool): Flag to determine if existing files should be overwritten (default is False). + base_path (Optional[Path]): The base path where the checkpoint file is located; used to resolve + relative paths. + + Returns + ------- + Path: The path to the imported checkpoint. + + Raises + ------ + FileNotFoundError: If the checkpoint file does not exist at the specified path. + """ + connector = self._get_connector(path) + ckpt_path: Path = connector.local_path(base_path=base_path) + ckpt_path = connector(ckpt_path, overwrite=overwrite) + + return ckpt_path + + @classmethod + def _get_connector(cls, ext, path=None, importer=True) -> ModelConnector: + """ + Retrieves the appropriate model connector based on the file extension and path, + distinguishing between importers and exporters. + + Args: + ext (str): The file extension or a URI that may include a protocol specifier. + path (Optional[Union[str, Path]]): The path where the model file is located or will be saved. + importer (bool): Flag to determine if the connector is for importing (True) or exporting (False). + + Returns + ------- + ModelConnector: The model connector instance capable of handling the import or export. + + Raises + ------ + ValueError: If no connector is found for the specified extension or if no default path is provided + when required. 
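As a concrete illustration of the resolution logic implemented just below (a sketch inferred from this patch, not additional API):

    # The "extension" may carry an inline path using a protocol-style prefix.
    uri = "hf://mistralai/Mistral-7B-v0.1"
    ext, _path = uri.split("://")
    # ext   -> "hf": selects the registered importer/exporter class
    # _path -> "mistralai/Mistral-7B-v0.1": passed to that connector's constructor
    # With a bare extension such as "hf", the connector's registered default_path
    # is used instead, and a ValueError is raised if neither is available.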
+ """ + _path = None + if "://" in ext: + ext, _path = ext.split("://") + else: + _path = path + + connector = cls._IMPORTERS.get(ext) if importer else cls._EXPORTERS.get(ext) + if not connector: + raise ValueError(f"No connector found for extension '{ext}'") + + if not _path: + if not connector.default_path: + raise ValueError(f"No default path specified for extension '{ext}'. ", "Please provide a path") + + return connector() + + return connector(_path) diff --git a/nemo/io/state.py b/nemo/io/state.py new file mode 100644 index 000000000000..d978cd0ade8e --- /dev/null +++ b/nemo/io/state.py @@ -0,0 +1,403 @@ +import inspect +import re +from dataclasses import dataclass +from typing import Any, Callable, Dict, Generic, List, Optional, Tuple, TypeVar, Union, overload + +import numpy as np +from torch import nn + +SourceModuleT = TypeVar("SourceModuleT", bound=nn.Module) +TargetModuleT = TypeVar("TargetModuleT", bound=nn.Module) +F = TypeVar("F", bound=Callable[..., Any]) + + +@dataclass +class TransformCTX: + source: nn.Module + source_state: dict + target: nn.Module + target_state: dict + + +def apply_transforms( + source: nn.Module, + target: TargetModuleT, + mapping: Dict[str, str], + transforms: Optional[List[Callable[[TransformCTX], TransformCTX]]] = None, +) -> TargetModuleT: + """ + Applies a series of transformations to adapt the state dictionary of a source module to + match the structure of a target module's state dictionary. + + This function renames keys according to a provided mapping and modifies values using a list + of transformation functions. Each transformation function typically is decorated + with `io.state_transform`. + + Args: + source (nn.Module): The source module from which parameters and buffers are taken. + target (TargetModuleT): The target module to which parameters and buffers are adapted. + mapping (Dict[str, str]): Key-value pairs where each key from the source state dictionary + is mapped to a corresponding key in the target state dictionary. + transforms (Optional[List[Callable[[TransformCTX], TransformCTX]]]): A list of functions + that modify the `TransformCTX` object. If None, no transformations beyond key renaming + are applied. Defaults to None. + + Returns + ------- + TargetModuleT: The modified target module with its state dictionary adjusted according to + the specified mappings and transformations. + + Raises + ------ + ValueError: If there's a mismatch in shape between corresponding source and target parameters + or buffers. + RuntimeError: If the target state dictionary contains keys that are not present in the source + state dictionary after all transformations. + + Examples + -------- + >>> source_module = nn.Linear(10, 5) + >>> target_module = nn.Linear(10, 5) + >>> mapping = {'weight': 'weights', 'bias': 'biases'} + @io.state_transform( + source_key="weight", + target_key="weights" + ) + def scale_weights(ctx): + ctx.target_state['weights'] = ctx.source_state['weight'] * 2 + return ctx + >>> transformed_target = apply_transforms( + ... source_module, target_module, mapping, [scale_weights] + ... ) + >>> print(transformed_target.state_dict()['weights']) + + See Also + -------- + - `TransformCTX`: For more details on the context object used in transformations. + - `StateDictTransform`: For creating complex transformations. + + Note: + This function is particularly useful when adapting models from different frameworks or + when consolidating models with different architectural changes. 
+ """ + from megatron.core.transformer.module import MegatronModule + + # TODO: How can we improve this? + _source = source + if hasattr(source, "module") and isinstance(source.module, MegatronModule): + _source = source.module + _target = target + if hasattr(target, "module") and isinstance(target.module, MegatronModule): + _target = target.module + + target_state = _target.state_dict() + ctx = TransformCTX(source=_source, source_state=_source.state_dict(), target=_target, target_state=target_state,) + + for key, val in mapping.items(): + ctx = StateDictTransform(key, val)(ctx) + + if transforms: + for transform in transforms: + ctx = transform(ctx) + + _params: Dict[str, nn.Parameter] = {} + for name, param in _target.named_parameters(): + if name in target_state: + target_param = target_state[name] + if param.data.shape != target_param.shape: + raise ValueError(f"Shape mismatch for parameter {name}: {param.shape} vs {target_param.shape}") + + _params[name] = nn.Parameter(target_param, requires_grad=param.requires_grad) + target_state.pop(name) + else: + print(f"Unexpected key: {name} not in checkpoint but in model.") + + for key, val in _params.items(): + _module, _key = _target, key + if "." in key: + for part in key.split(".")[:-1]: + _module = getattr(_module, part) + _key = key.split(".")[-1] + + _module.register_parameter(_key, val) + + _buffers = {} + for name, buffer in _target.named_buffers(): + if name in target_state: + if buffer.shape != target_state[name].shape: + raise ValueError(f"Shape mismatch for buffer {name}: {buffer.shape} vs {target_state[name].shape}") + + _buffers[name] = nn.Parameter(target_state[name], requires_grad=False) + target_state.pop(name) + + for key, val in _buffers.items(): + _module, _key = _target, key + if "." in key: + for part in key.split(".")[:-1]: + _module = getattr(_module, part) + _key = key.split(".")[-1] + + _module.register_buffer(_key, val) + + keys = [name for name in list(target_state.keys()) if not name.endswith("_extra_state")] + if len(keys) != 0: + raise RuntimeError(f"Additional keys: {target_state.keys()} in checkpoint but not in model.") + + # TODO: Is this correct? + # for key in target.state_dict(): + # if key.endswith("_extra_state"): + # del target.state_dict()[key] + + """finally: + cls._set_model_restore_state(is_being_restored=False)""" + + if hasattr(target, "module") and isinstance(target.module, MegatronModule): + target.module = _target + + return target + + return _target + + +def _default_transform(inp): + return inp.float() + + +class StateDictTransform(Generic[F]): + """ + A transformation class for state dictionaries, allowing for flexible key matching and + transformation of values between source and target state dictionaries. + + Attributes + ---------- + source_key: A string, tuple of strings, or a dictionary specifying the keys in the source + state dictionary to match. Wildcards (*) are supported. + target_key: A string or tuple of strings specifying the keys in the target state dictionary + to match. Wildcards (*) are supported. + transform: A callable that performs the transformation on matched keys' values. + + Examples + -------- + >>> def example_transform(ctx, *args): + ... return sum(args) + >>> transform = StateDictTransform( + ... source_key="model.layers.*.self_attn.*_proj.weight", + ... target_key="decoder.layers.*.self_attention.linear_qkv.weight", + ... transform=example_transform + ... 
) + """ + + def __init__( + self, + source_key: Union[str, Tuple[str, ...], Dict[str, str]], + target_key: Union[str, Tuple[str, ...]], + transform: F = _default_transform, + ): + self.source_key = source_key + self.target_key = target_key + self.transform = transform + + def __call__(self, ctx: TransformCTX) -> TransformCTX: + source_key = self.source_key + target_key = self.target_key + source_dict, target_dict = ctx.source_state, ctx.target_state + + fn_params = dict(inspect.signature(self.transform).parameters) + fn_params.pop("ctx", None) + + if isinstance(source_key, (dict, tuple)): + if isinstance(source_key, tuple): + source_key_dict = {param: source_key[i] for i, param in enumerate(fn_params)} + else: + source_key_dict = source_key + source_matches_dict = {k: _match_keys(list(source_dict.keys()), v) for k, v in source_key_dict.items()} + target_matches = _match_keys(list(target_dict.keys()), target_key) + + for target_index, target_match in np.ndenumerate(target_matches): + kwargs = {} + for param in fn_params: + if param in source_matches_dict: + source_match = source_matches_dict[param][target_index[:-1]] + kwargs[param] = source_dict[source_match[target_index]] + + target_dict[target_match] = self.call_transform(ctx, **kwargs) + else: + source_keys = list(source_dict.keys()) + target_keys = list(target_dict.keys()) + + source_matches = _match_keys(source_keys, source_key) + if source_matches.size == 1 and source_matches == np.array(None): + raise ValueError(f"No matches found for source key: {source_key}") + + if isinstance(target_key, str): + target_matches = _match_keys(target_keys, target_key) + if target_matches.size < 1: + raise ValueError(f"No matches found for target key: {target_key}") + else: + if isinstance(target_key, dict): + raise ValueError("Target key must be a string or a tuple of strings.") + + _matches = np.vstack([_match_keys(target_keys, key) for key in target_key]) + target_matches = np.transpose(_matches) + + # Determine if we are dealing with multiple source matches or multiple target matches + multiple_sources = source_matches.ndim >= target_matches.ndim + accepts_var_args = any( + param.kind == param.VAR_POSITIONAL for param in inspect.signature(self.transform).parameters.values() + ) + + if multiple_sources: + for target_index, target_match in np.ndenumerate(target_matches): + source_match = source_matches[target_index] + + if accepts_var_args: + source_values = [source_dict[k] for k in source_match] + target_dict[target_match] = self.call_transform(ctx, *source_values) + else: + _source_match_list = [source_match] if isinstance(source_match, str) else list(source_match) + if len(fn_params) != len(_source_match_list): + raise ValueError( + f"Mismatch between source and target keys: {source_match} vs {target_match}" + ) + + kwargs = {param: source_dict[k] for param, k in zip(fn_params, _source_match_list)} + target_dict[target_match] = self.call_transform(ctx, **kwargs) + else: + if source_matches.ndim == 0: + source_matches_list = [source_matches.item()] + source_matches = np.array(source_matches_list, dtype=object) + else: + source_matches_list = list(source_matches) + + if source_matches.shape[0] != target_matches.shape[0]: + if target_matches.shape[0] == 1 and source_matches.shape[0] == target_matches.shape[1]: + source_matches_list = [source_matches_list] + else: + raise ValueError( + "Mismatch between source and target keys: {source_matches} vs {target_matches}" + ) + + for source_index, source_match in enumerate(source_matches_list): + 
target_match = target_matches[source_index] + source_values = ( + [source_dict[source_match]] + if np.isscalar(source_match) + else [source_dict[k] for k in source_match] + ) + if accepts_var_args: + outputs = self.call_transform(ctx, *source_values) + else: + kwargs = {param: val for param, val in zip(fn_params, source_values)} + outputs = self.call_transform(ctx, **kwargs) + + if isinstance(target_match, str): + target_dict[target_match] = outputs + else: + for i, t in enumerate(outputs): + target_dict[target_match[i]] = t + + return ctx + + def call_transform(self, ctx: TransformCTX, *args, **kwargs): + func_params = inspect.signature(self.transform).parameters + expected_num_args = len([p for p in func_params if p not in ['self', 'ctx']]) + provided_num_args = len(args) + len(kwargs) + accepts_var_args = any(param.kind == param.VAR_POSITIONAL for param in func_params.values()) + + if not accepts_var_args and provided_num_args != expected_num_args: + raise ValueError( + f"Expected {expected_num_args} arguments for the transformation function, but got {provided_num_args}." + ) + + if 'ctx' in func_params: + return self.transform(ctx, *args, **kwargs) + + return self.transform(*args, **kwargs) + + +def _match_keys(keys: List[str], pattern: str) -> np.ndarray: + regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$") + wildcard_matches = [[] for _ in range(pattern.count("*"))] + + for key in keys: + match = regex_pattern.match(key) + if match: + for i, group in enumerate(match.groups()): + if group not in wildcard_matches[i]: + wildcard_matches[i].append(group) + + # Sort the wildcard matches to maintain consistent ordering + for i in range(len(wildcard_matches)): + wildcard_matches[i].sort(key=lambda x: int(x) if x.isdigit() else x) + + # Determine the shape of the output array based on the unique matches for each wildcard + shape = [len(matches) for matches in wildcard_matches] + + # Initialize an empty array with the determined shape + output_array = np.empty(shape, dtype=object) + + # Populate the array with the keys, now that we have the correct shape and ordering + for key in keys: + match = regex_pattern.match(key) + if match: + # Convert match groups to indices based on their position in wildcard_matches + indices = [wildcard_matches[i].index(group) for i, group in enumerate(match.groups())] + output_array[tuple(indices)] = key # Place the key in the array based on the indices + + return output_array + + +@overload +def state_transform( + source_key: Union[str, Tuple[str, ...], Dict[str, str]], target_key: Union[str, Tuple[str, ...]], +) -> Callable[[F], StateDictTransform[F]]: + ... + + +@overload +def state_transform( + source_key: Union[str, Tuple[str, ...], Dict[str, str]], target_key: Union[str, Tuple[str, ...]], fn: F +) -> StateDictTransform[F]: + ... + + +def state_transform( + source_key: Union[str, Tuple[str, ...], Dict[str, str]], + target_key: Union[str, Tuple[str, ...]], + fn: Optional[F] = None, +): + """ + A decorator for creating StateDictTransform instances with specified source and target keys, + and a transformation function. This allows for concise definition of state dictionary + transformations. + + Args: + source_key: A string, tuple of strings, or a dictionary specifying the keys in the source + state dictionary to match. Wildcards (*) are supported. + target_key: A string or tuple of strings specifying the keys in the target state dictionary + to match. Wildcards (*) are supported. 
+ fn: An optional callable that performs the transformation on matched keys' values. If not + provided, the decorator can be used to wrap a function definition. + + Returns + ------- + A StateDictTransform instance if `fn` is provided, otherwise returns a decorator that + takes a function and returns a StateDictTransform instance. + + Examples + -------- + >>> @state_transform( + ... source_key="model.layers.*.self_attn.*_proj.weight", + ... target_key="decoder.layers.*.self_attention.linear_qkv.weight" + ... ) + ... def sum_transform(ctx, *args): + ... return sum(args) + """ + + def wrapper(fn) -> StateDictTransform: + return StateDictTransform(source_key, target_key, fn) + + if fn is None: + return wrapper + + return wrapper(fn) diff --git a/nemo/llm/__init__.py b/nemo/llm/__init__.py index 2dd39b3f170e..a05c96f60944 100644 --- a/nemo/llm/__init__.py +++ b/nemo/llm/__init__.py @@ -1,5 +1,13 @@ from nemo.llm.gpt.data import MockDataModule -from nemo.llm.gpt.model import GPTConfig, GPTModel, MaskedTokenLossReduction, gpt_data_step, gpt_forward_step +from nemo.llm.gpt.model import ( + GPTConfig, + GPTModel, + MaskedTokenLossReduction, + Mistral7BConfig, + Mistral7BModel, + gpt_data_step, + gpt_forward_step, +) __all__ = [ "MockDataModule", @@ -8,4 +16,6 @@ "gpt_data_step", "gpt_forward_step", "MaskedTokenLossReduction", + "Mistral7BConfig", + "Mistral7BModel", ] diff --git a/nemo/llm/gpt/model/__init__.py b/nemo/llm/gpt/model/__init__.py index 9481e75542ed..05c3e9928fab 100644 --- a/nemo/llm/gpt/model/__init__.py +++ b/nemo/llm/gpt/model/__init__.py @@ -1,3 +1,12 @@ from nemo.llm.gpt.model.base import GPTConfig, GPTModel, MaskedTokenLossReduction, gpt_data_step, gpt_forward_step +from nemo.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel -__all__ = ["GPTConfig", "GPTModel", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step"] +__all__ = [ + "GPTConfig", + "GPTModel", + "Mistral7BConfig", + "Mistral7BModel", + "MaskedTokenLossReduction", + "gpt_data_step", + "gpt_forward_step", +] diff --git a/nemo/llm/gpt/model/base.py b/nemo/llm/gpt/model/base.py index 93186a7e7e08..554870712a36 100644 --- a/nemo/llm/gpt/model/base.py +++ b/nemo/llm/gpt/model/base.py @@ -23,7 +23,9 @@ class GPTConfig(TransformerConfig, io.IOMixin): fp16_lm_cross_entropy: bool = False parallel_output: bool = True share_embeddings_and_output_weights: bool = False + make_vocab_size_divisible_by: int = 128 position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute" + rotary_base: int = 10000 rotary_percent: float = 1.0 seq_len_interpolation_factor: Optional[float] = None seq_length: int = 1024 @@ -48,20 +50,21 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": return MCoreGPTModel( self, transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), - vocab_size=get_vocab_size(self, tokenizer.vocab_size), + vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), max_sequence_length=self.seq_length, fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, parallel_output=self.parallel_output, share_embeddings_and_output_weights=self.share_embeddings_and_output_weights, position_embedding_type=self.position_embedding_type, rotary_percent=self.rotary_percent, + rotary_base=self.rotary_base, seq_len_interpolation_factor=self.seq_len_interpolation_factor, pre_process=parallel_state.is_pipeline_first_stage(), post_process=parallel_state.is_pipeline_last_stage(), ) -class GPTModel(L.LightningModule, io.IOMixin): +class 
GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin): def __init__( self, config: GPTConfig, diff --git a/nemo/llm/gpt/model/mistral_7b.py b/nemo/llm/gpt/model/mistral_7b.py new file mode 100644 index 000000000000..83d3b3412a39 --- /dev/null +++ b/nemo/llm/gpt/model/mistral_7b.py @@ -0,0 +1,263 @@ +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING, Callable, List, Optional + +import torch +import torch.nn.functional as F + +from nemo import io +from nemo.llm.gpt.model.base import GPTConfig, GPTModel + +if TYPE_CHECKING: + from transformers import MistralConfig, MistralForCausalLM + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + +@dataclass +class Mistral7BConfig(GPTConfig): + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + position_embedding_type: str = "rope" + add_bias_linear: bool = False + gated_linear_unit: bool = True + apply_query_key_layer_scaling: bool = True + + num_layers: int = 32 + hidden_size: int = 4096 + num_attention_heads: int = 32 + num_query_groups: int = 8 + ffn_hidden_size: int = 14336 + seq_length: int = 32768 + + init_method_std: float = 0.02 + layernorm_epsilon: float = 1e-5 + window_size: List[int] = field(default_factory=lambda: [4096, 0]) + + +class Mistral7BModel(GPTModel): + def __init__(self, config: Optional[Mistral7BConfig] = None, tokenizer=None): + _tokenizer = tokenizer or HFMistral7BImporter().tokenizer + + super().__init__(config or Mistral7BConfig(), _tokenizer) + + +@io.model_importer(Mistral7BModel, "hf", default_path="mistralai/Mistral-7B-v0.1") +class HFMistral7BImporter(io.ModelConnector["MistralForCausalLM", Mistral7BModel]): + def init(self) -> Mistral7BModel: + return Mistral7BModel(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import MistralForCausalLM + + source = MistralForCausalLM.from_pretrained(str(self)) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + return output_path + + def convert_state(self, source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + "lm_head.weight": "output_layer.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(str(self)) + + @property + def config(self) -> Mistral7BConfig: + from transformers import MistralConfig + + source = MistralConfig.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(mistral_vocab_size): + base = 128 + while mistral_vocab_size % base != 0: + base //= 2 + return base + + output = Mistral7BConfig( + seq_length=source.max_position_embeddings, + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + 
num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + num_query_groups=source.num_key_value_heads, + rotary_base=source.rope_theta, + gated_linear_unit=True, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + window_size=[source.sliding_window, 0], + ) + + return output + + +@io.model_exporter(Mistral7BModel, "hf") +class HFMistral7BExporter(io.ModelConnector[Mistral7BModel, "MistralForCausalLM"]): + def init(self) -> "MistralForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config) + + def apply(self, output_path: Path) -> Path: + # TODO: Make it work with lazy init + # with torch.device("meta"): + # target = self.init() + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + # TODO: Make sure we don't need to do this + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + + return io.apply_transforms(source, target, mapping=mapping, transforms=[_export_qkv, _export_linear_fc1]) + + @property + def tokenizer(self): + return io.load_ckpt(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "MistralConfig": + source: Mistral7BConfig = io.load_ckpt(str(self)).model.config + + from transformers import MistralConfig + + return MistralConfig( + sliding_window=source.window_size[0], + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + vocab_size=self.tokenizer.vocab_size, + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i 
* heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0).float() + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 6acfddad9189..5ad2519cfd1a 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,3 +1,4 @@ +cloudpickle fiddle hydra-core>1.3,<=1.3.2 omegaconf<=2.3 diff --git a/tests/io/test_state.py b/tests/io/test_state.py new file mode 100644 index 000000000000..bb5dc4a9af3d --- /dev/null +++ b/tests/io/test_state.py @@ -0,0 +1,233 @@ +import pytest +from torch import nn + +from nemo.io.state import StateDictTransform, TransformCTX, state_transform + + +class TestStateDictTransform: + """ + Tests for the StateDictTransform functionality. + """ + + @pytest.fixture + def mock_ctx(self): + """ + Provides a mock transformation context with predefined source and target states. + + Returns + ------- + TransformCTX: A context object with source and target states. 
+ """ + source_state = { + 'model.layers.0.self_attn.q_proj.weight': 1, + 'model.layers.0.self_attn.k_proj.weight': 2, + 'model.layers.0.self_attn.v_proj.weight': 3, + 'model.layers.1.self_attn.q_proj.weight': 1, + 'model.layers.1.self_attn.k_proj.weight': 2, + 'model.layers.1.self_attn.v_proj.weight': 3, + } + target_state = { + "decoder.layers.0.self_attention.linear_qkv.weight": 10, + "decoder.layers.1.self_attention.linear_qkv.weight": 10, + } + ctx = TransformCTX( + source=nn.Module(), source_state=source_state, target=nn.Module(), target_state=target_state + ) + return ctx + + @pytest.fixture + def mock_multi_target_ctx(self): + """ + Provides a mock transformation context with a source state that matches the expected source_key + and a target state prepared with initial values for the expected target_keys. + """ + source_state = {'model.layers.1.self_attn.q_proj.weight': 1} + # Populate target_state with initial placeholder values for keys expected to be matched and updated + target_state = { + 'decoder.layers.1.self_attention.linear_q.weight': 0, + 'decoder.layers.1.self_attention.linear_k.weight': 0, + } + ctx = TransformCTX( + source=nn.Module(), source_state=source_state, target=nn.Module(), target_state=target_state + ) + return ctx + + def test_transform_with_multiple_source_keys(self, mock_ctx): + """ + Test transformation when multiple source keys are specified. + """ + transform = StateDictTransform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight", + transform=lambda ctx, k, q, v: q + k + v, + ) + transform(mock_ctx) + assert mock_ctx.target_state["decoder.layers.0.self_attention.linear_qkv.weight"] == 6 + assert mock_ctx.target_state["decoder.layers.1.self_attention.linear_qkv.weight"] == 6 + + def test_transform_with_wildcard_in_source_keys(self, mock_ctx): + """ + Test transformation using a wildcard pattern in source keys. + """ + transform = StateDictTransform( + source_key="model.layers.*.self_attn.*_proj.weight", + target_key="decoder.layers.*.self_attention.linear_qkv.weight", + transform=lambda ctx, k, q, v: q + k + v, + ) + transform(mock_ctx) + assert mock_ctx.target_state["decoder.layers.0.self_attention.linear_qkv.weight"] == 6 + assert mock_ctx.target_state["decoder.layers.1.self_attention.linear_qkv.weight"] == 6 + + def test_transform_with_mapped_source_keys(self, mock_ctx): + """ + Test transformation with a dictionary mapping for source keys. + """ + transform = StateDictTransform( + source_key={ + "k": "model.layers.*.self_attn.k_proj.weight", + "q": "model.layers.*.self_attn.q_proj.weight", + "v": "model.layers.*.self_attn.v_proj.weight", + }, + target_key="decoder.layers.*.self_attention.linear_qkv.weight", + transform=lambda ctx, k, q, v: q + k + v, + ) + transform(mock_ctx) + assert mock_ctx.target_state["decoder.layers.0.self_attention.linear_qkv.weight"] == 6 + assert mock_ctx.target_state["decoder.layers.1.self_attention.linear_qkv.weight"] == 6 + + def test_transform_with_variable_arguments(self, mock_ctx): + """ + Test transformation with a wildcard pattern and variable arguments. 
+ """ + transform = StateDictTransform( + source_key="model.layers.*.self_attn.*_proj.weight", + target_key="decoder.layers.*.self_attention.linear_qkv.weight", + transform=lambda ctx, *args: sum(args), + ) + transform(mock_ctx) + assert mock_ctx.target_state["decoder.layers.0.self_attention.linear_qkv.weight"] == 6 + assert mock_ctx.target_state["decoder.layers.1.self_attention.linear_qkv.weight"] == 6 + + def test_transform_with_no_matching_source_keys(self, mock_ctx): + """ + Test transformation when no source keys match the pattern. + """ + transform = StateDictTransform( + source_key="non.existent.pattern", + target_key="decoder.layers.*.self_attention.linear_qkv.weight", + transform=lambda ctx, *args: sum(args), + ) + with pytest.raises(ValueError): + transform(mock_ctx) + + def test_transform_with_invalid_transform_function(self, mock_ctx): + """ + Test transformation with a transform function that does not match expected signature. + """ + transform = StateDictTransform( + source_key="model.layers.*.self_attn.q_proj.weight", + target_key="decoder.layers.*.self_attention.linear_qkv.weight", + transform=lambda ctx: 0, # Invalid signature + ) + with pytest.raises(ValueError): + transform(mock_ctx) + + def test_transform_with_tuple_target_key_and_multiple_outputs(self, mock_multi_target_ctx): + """ + Test transformation where the target_key is a tuple and the transform function + returns multiple values that are then unrolled to these target keys. + """ + # Define a transformation that splits the input into two parts + def split_transform(ctx, x): + return x - 1, x + 1 + + # Apply the transformation + transform = StateDictTransform( + source_key="model.layers.1.self_attn.q_proj.weight", + target_key=( + "decoder.layers.1.self_attention.linear_q.weight", + "decoder.layers.1.self_attention.linear_k.weight", + ), + transform=split_transform, + ) + transform(mock_multi_target_ctx) + + # Check that the target state has been updated correctly + assert mock_multi_target_ctx.target_state["decoder.layers.1.self_attention.linear_q.weight"] == 0 + assert mock_multi_target_ctx.target_state["decoder.layers.1.self_attention.linear_k.weight"] == 2 + + +class TestStateTransformDecorator: + """ + Tests for the @state_transform decorator functionality. + """ + + @pytest.fixture + def mock_ctx(self): + """ + Provides a mock transformation context with predefined source and target states. + """ + source_state = { + 'model.layers.1.self_attn.q_proj.weight': 1, + 'model.layers.1.self_attn.k_proj.weight': 2, + 'model.layers.1.self_attn.v_proj.weight': 3, + } + # Pre-populate target_state with initial values or placeholders + target_state = { + "decoder.layers.1.self_attention.linear_q.weight": 0, + "decoder.layers.1.self_attention.linear_k.weight": 0, + "decoder.layers.1.self_attention.linear_v.weight": 0, + } + ctx = TransformCTX( + source=nn.Module(), source_state=source_state, target=nn.Module(), target_state=target_state + ) + return ctx + + def test_single_transform(self, mock_ctx): + """ + Test the @state_transform decorator with a single source and target key. + """ + # Apply the transformation + single_transform(mock_ctx) + # Verify the target state is updated correctly + assert mock_ctx.target_state["decoder.layers.1.self_attention.linear_q.weight"] == 11 + + def test_multiple_outputs_transform(self, mock_ctx): + """ + Test the @state_transform decorator with a single source key and multiple target keys. 
+ """ + # Apply the transformation + multiple_outputs_transform(mock_ctx) + # Verify the target state is updated correctly for each key + assert mock_ctx.target_state["decoder.layers.1.self_attention.linear_q.weight"] == 2 + assert mock_ctx.target_state["decoder.layers.1.self_attention.linear_k.weight"] == 1 + assert mock_ctx.target_state["decoder.layers.1.self_attention.linear_v.weight"] == 3 + + +@state_transform( + source_key="model.layers.*.self_attn.q_proj.weight", target_key="decoder.layers.1.self_attention.linear_q.weight" +) +def single_transform(ctx, x): + """ + A single transformation function that adds 10 to the input value. + """ + return x + 10 + + +@state_transform( + source_key="model.layers.1.self_attn.*_proj.weight", + target_key=( + "decoder.layers.1.self_attention.linear_q.weight", + "decoder.layers.1.self_attention.linear_k.weight", + "decoder.layers.1.self_attention.linear_v.weight", + ), +) +def multiple_outputs_transform(ctx, *args): + """ + A transformation function that returns multiple values for multiple target keys. + """ + return args From 894e5022651f6b31523964333d07937344d258f0 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Fri, 3 May 2024 15:10:15 +0400 Subject: [PATCH 026/178] RNN-T and TDT inference: use CUDA graphs by default (#8972) * Use Cuda graphs by default for RNN-T and TDT Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev --- nemo/collections/asr/models/asr_model.py | 51 ++++- nemo/collections/asr/modules/rnnt.py | 4 +- .../cuda_graph_rnnt_greedy_decoding.py | 13 +- .../asr/parts/submodules/rnnt_decoding.py | 4 +- .../parts/submodules/rnnt_greedy_decoding.py | 98 +++++++-- .../submodules/rnnt_loop_labels_computer.py | 180 +++++++++++++--- .../submodules/tdt_loop_labels_computer.py | 199 ++++++++++++++---- .../common/parts/optional_cuda_graphs.py | 89 ++++++++ nemo/core/utils/cuda_python_utils.py | 2 +- .../asr/decoding/rnnt_alignments_check.py | 12 +- .../test_cuda_graph_rnnt_greedy_decoding.py | 138 +++++++++--- .../asr/test_asr_rnnt_encdec_model.py | 18 +- .../common/test_optional_cuda_graphs.py | 71 +++++++ 13 files changed, 746 insertions(+), 133 deletions(-) create mode 100644 nemo/collections/common/parts/optional_cuda_graphs.py create mode 100644 tests/collections/common/test_optional_cuda_graphs.py diff --git a/nemo/collections/asr/models/asr_model.py b/nemo/collections/asr/models/asr_model.py index e14424cec5c1..0539f961a1ca 100644 --- a/nemo/collections/asr/models/asr_model.py +++ b/nemo/collections/asr/models/asr_model.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -from abc import ABC, abstractmethod -from typing import List +from abc import ABC +from typing import List, Optional import torch +from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs from nemo.core.classes import ModelPT from nemo.core.classes.common import PretrainedModelInfo from nemo.core.classes.exportable import Exportable @@ -171,6 +172,52 @@ def on_after_backward(self): logging.warning(f'detected inf or nan values in gradients! Setting gradients to zero.') self.zero_grad() + def on_train_epoch_start(self) -> None: + """ + Decoder with CUDA graphs does not release memory, thus we disable it for training epoch. 
+ EncDecRNNTModel.decoding.decoding is the inference class with CUDA graphs + """ + WithOptionalCudaGraphs.disable_cuda_graphs_recursive(self, attribute_path="decoding.decoding") + + def on_train_epoch_end(self) -> None: + """ + After training, we can enable the decoder with CUDA graphs. + EncDecRNNTModel.decoding.decoding is the inference class with CUDA graphs + """ + WithOptionalCudaGraphs.enable_cuda_graphs_recursive(self, attribute_path="decoding.decoding") + + def on_validation_epoch_start(self) -> None: + """ + For validation, we enable CUDA graphs to speedup validation. + EncDecRNNTModel.decoding.decoding is the inference class with CUDA graphs. + """ + WithOptionalCudaGraphs.enable_cuda_graphs_recursive(self, attribute_path="decoding.decoding") + + def on_validation_epoch_end(self) -> Optional[dict[str, dict[str, torch.Tensor]]]: + """ + After validation, we disable CUDA graphs, since `validation` can be called in training loop, and + training will continue after validation + EncDecRNNTModel.decoding.decoding is the inference class with CUDA graphs. + """ + WithOptionalCudaGraphs.disable_cuda_graphs_recursive(self, attribute_path="decoding.decoding") + return super().on_validation_epoch_end() + + def on_test_epoch_start(self) -> None: + """ + For testing, we enable CUDA graphs to speedup validation. + We do not need to disable CUDA graphs after testing, since `test` cannot be called in training loop. + EncDecRNNTModel.decoding.decoding is the inference class with CUDA graphs. + """ + WithOptionalCudaGraphs.enable_cuda_graphs_recursive(self, attribute_path="decoding.decoding") + + def on_predict_epoch_start(self) -> None: + """ + For predicting, we enable CUDA graphs to speedup validation. + We do not need to disable CUDA graphs after predicting, since `predict` cannot be called in training loop. 
+ EncDecRNNTModel.decoding.decoding is the inference class with CUDA graphs + """ + WithOptionalCudaGraphs.enable_cuda_graphs_recursive(self, attribute_path="decoding.decoding") + class ExportableEncDecModel(Exportable): """ diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index 055066c00660..2355cfb7005b 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -312,7 +312,9 @@ def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: batch = y.size(0) # state contains context_size - 1 elements for each utterance in batch, # consistent with the state returned from StatelessNet.forward - state = [torch.ones([batch, self.context_size - 1], dtype=torch.long, device=y.device) * self.blank_idx] + state = [ + torch.full([batch, self.context_size - 1], fill_value=self.blank_idx, dtype=torch.long, device=y.device) + ] return state def batch_initialize_states(self, batch_states: List[torch.Tensor], decoder_states: List[List[torch.Tensor]]): diff --git a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py index 388737443fd4..93cef4d4138e 100644 --- a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py @@ -292,14 +292,21 @@ def __call__( partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not available with cuda graphs (but could be)") + raise NotImplementedError( + "`partial_hypotheses` support is not available " + "with Frame-Looping algorithm with Cuda graphs (not implemented yet)" + ) if self.caller.preserve_alignments: - raise NotImplementedError("`preserve_alignments` support is not available with cuda graphs (but could be)") + raise NotImplementedError( + "`preserve_alignments` support is not available" + "with Frame-Looping algorithm with Cuda graphs (not implemented yet)" + ) if self.caller.preserve_frame_confidence: raise NotImplementedError( - "`preserve_frame_confidence` support is not available with cuda graphs (but could be)" + "`preserve_frame_confidence` support is not available" + "with Frame-Looping algorithm with Cuda graphs (not implemented yet)" ) batch_size = x.shape[0] diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 71079f4b6382..5fa225864f8c 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -331,7 +331,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): preserve_frame_confidence=self.preserve_frame_confidence, confidence_method_cfg=self.confidence_method_cfg, loop_labels=self.cfg.greedy.get('loop_labels', True), - use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', False), + use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', True), ) else: self.decoding = rnnt_greedy_decoding.GreedyBatchedTDTInfer( @@ -347,7 +347,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): preserve_frame_confidence=self.preserve_frame_confidence, include_duration_confidence=self.tdt_include_duration_confidence, confidence_method_cfg=self.confidence_method_cfg, - use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', False), + 
use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', True), ) else: diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index e5de99cf0776..b2fa9b85b5fd 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -38,6 +38,7 @@ from nemo.collections.asr.parts.submodules.tdt_loop_labels_computer import GreedyBatchedTDTLoopLabelsComputer from nemo.collections.asr.parts.utils import rnnt_utils from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodConfig, ConfidenceMethodMixin +from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs from nemo.collections.common.parts.rnn import label_collate from nemo.core.classes import Typing, typecheck from nemo.core.neural_types import AcousticEncodedRepresentation, HypothesisType, LengthsType, NeuralType @@ -508,7 +509,7 @@ def _greedy_decode( return hypothesis -class GreedyBatchedRNNTInfer(_GreedyRNNTInfer): +class GreedyBatchedRNNTInfer(_GreedyRNNTInfer, WithOptionalCudaGraphs): """A batch level greedy transducer decoder. Batch level greedy decoding, performed auto-regressively. @@ -589,7 +590,7 @@ def __init__( preserve_frame_confidence: bool = False, confidence_method_cfg: Optional[DictConfig] = None, loop_labels: bool = True, - use_cuda_graph_decoder: bool = False, + use_cuda_graph_decoder: bool = True, ): super().__init__( decoder_model=decoder_model, @@ -602,13 +603,14 @@ def __init__( ) self.use_cuda_graph_decoder = use_cuda_graph_decoder + self.loop_labels = loop_labels # Depending on availability of `blank_as_pad` support # switch between more efficient batch decoding technique self._decoding_computer = None if self.decoder.blank_as_pad: - if loop_labels: - # default (faster) algo: loop over labels + if self.loop_labels: + # Label-Looping algorithm (default, faster) self._greedy_decode = self._greedy_decode_blank_as_pad_loop_labels self._decoding_computer = GreedyBatchedRNNTLoopLabelsComputer( decoder=self.decoder, @@ -618,20 +620,74 @@ def __init__( preserve_alignments=preserve_alignments, preserve_frame_confidence=preserve_frame_confidence, confidence_method_cfg=confidence_method_cfg, - allow_cuda_graphs=use_cuda_graph_decoder, + allow_cuda_graphs=self.use_cuda_graph_decoder, ) - elif use_cuda_graph_decoder: - from nemo.collections.asr.parts.submodules.cuda_graph_rnnt_greedy_decoding import ( - RNNTGreedyDecodeCudaGraph, - ) - - self._greedy_decode = RNNTGreedyDecodeCudaGraph(max_symbols_per_step, self) else: - # previous algo: loop over frames - self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames + # Frame-Looping algorithm + if not self.use_cuda_graph_decoder: + self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames + else: + if self.preserve_alignments: + logging.warning("`preserve_alignments` is not implemented for Frame-Looping + CUDA graphs") + self.use_cuda_graph_decoder = False + if self.preserve_frame_confidence: + logging.warning( + "`preserve_frame_confidence` is not implemented for Frame-Looping + CUDA graphs" + ) + self.use_cuda_graph_decoder = False + if not torch.cuda.is_available(): + self.use_cuda_graph_decoder = False + + if self.use_cuda_graph_decoder: + try: + from nemo.collections.asr.parts.submodules.cuda_graph_rnnt_greedy_decoding import ( + RNNTGreedyDecodeCudaGraph, + ) + + self._greedy_decode = RNNTGreedyDecodeCudaGraph(max_symbols_per_step, self) + 
except (ImportError, ModuleNotFoundError, ValueError) as e: + self.use_cuda_graph_decoder = False + logging.warning(f"Cannot use decoder with CUDA graphs, reason: {e.msg}") + self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames + else: + self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames else: self._greedy_decode = self._greedy_decode_masked + def disable_cuda_graphs(self): + """Disable CUDA graphs (e.g., for decoding in training)""" + if not self.use_cuda_graph_decoder: + # CUDA graphs not allowed, nothing to do + return + + if not self.decoder.blank_as_pad: + # blank as pad uses decoding without CUDA graphs + return + + if self.loop_labels: + # Label-Looping implementation + self._decoding_computer.disable_cuda_graphs() + else: + self._greedy_decode = self._greedy_decode_blank_as_pad_loop_frames + + def maybe_enable_cuda_graphs(self): + """Enable CUDA graphs (if allowed)""" + if not self.use_cuda_graph_decoder: + # CUDA graphs not allowed, nothing to do + return + + if not self.decoder.blank_as_pad: + # blank as pad uses decoding without CUDA graphs + return + + if self.loop_labels: + # Label-Looping implementation + self._decoding_computer.maybe_enable_cuda_graphs() + else: + from nemo.collections.asr.parts.submodules.cuda_graph_rnnt_greedy_decoding import RNNTGreedyDecodeCudaGraph + + self._greedy_decode = RNNTGreedyDecodeCudaGraph(self.max_symbols, self) + @typecheck() def forward( self, @@ -2302,7 +2358,7 @@ class GreedyBatchedRNNTInferConfig: tdt_include_duration_confidence: bool = False confidence_method_cfg: Optional[ConfidenceMethodConfig] = field(default_factory=lambda: ConfidenceMethodConfig()) loop_labels: bool = True - use_cuda_graph_decoder: bool = False + use_cuda_graph_decoder: bool = True def __post_init__(self): # OmegaConf.structured ensures that post_init check is always executed @@ -2580,7 +2636,7 @@ def _greedy_decode( return hypothesis -class GreedyBatchedTDTInfer(_GreedyRNNTInfer): +class GreedyBatchedTDTInfer(_GreedyRNNTInfer, WithOptionalCudaGraphs): """A batch level greedy TDT decoder. Batch level greedy decoding, performed auto-regressively. Args: @@ -2652,7 +2708,7 @@ def __init__( preserve_frame_confidence: bool = False, include_duration_confidence: bool = False, confidence_method_cfg: Optional[DictConfig] = None, - use_cuda_graph_decoder: bool = False, + use_cuda_graph_decoder: bool = True, ): super().__init__( decoder_model=decoder_model, @@ -2759,3 +2815,13 @@ def _greedy_decode_blank_as_pad_loop_labels( for hyp, state in zip(hyps, self.decoder.batch_split_states(last_decoder_state)): hyp.dec_state = state return hyps + + def disable_cuda_graphs(self): + """Disable CUDA graphs (e.g., for decoding in training)""" + if self._decoding_computer is not None: + self._decoding_computer.disable_cuda_graphs() + + def maybe_enable_cuda_graphs(self): + """Enable CUDA graphs (if allowed)""" + if self._decoding_computer is not None: + self._decoding_computer.maybe_enable_cuda_graphs() diff --git a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py index 92cb8a36aeb5..b920dba09cfd 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
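The `disable_cuda_graphs`/`maybe_enable_cuda_graphs` methods added above implement the new `WithOptionalCudaGraphs` interface (added in nemo/collections/common/parts/optional_cuda_graphs.py, whose body is not shown in this excerpt). A hedged sketch of a minimal implementer, with a hypothetical class name and an assumed two-method contract:

    from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs


    class ToyGraphedDecoder(WithOptionalCudaGraphs):
        """Hypothetical decoder; only the two methods this patch relies on are shown."""

        def __init__(self):
            super().__init__()
            self.cuda_graphs_enabled = False

        def maybe_enable_cuda_graphs(self):
            # Compile/enable CUDA graphs when the environment allows it.
            self.cuda_graphs_enabled = True

        def disable_cuda_graphs(self):
            # Release graph-related memory and fall back to eager decoding.
            self.cuda_graphs_enabled = False

The Lightning hooks added to `ASRModel` earlier in this patch then toggle any such object reachable at `self.decoding.decoding` via `WithOptionalCudaGraphs.disable_cuda_graphs_recursive(self, attribute_path="decoding.decoding")` and its `enable_` counterpart.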
-from typing import Any, Optional, Tuple +from dataclasses import dataclass, field +from typing import Any, Optional, Tuple, Union import numpy as np import torch @@ -21,6 +22,7 @@ from nemo.collections.asr.parts.utils import rnnt_utils from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodMixin +from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs from nemo.core.utils.cuda_python_utils import ( check_cuda_python_cuda_graphs_conditional_nodes_supported, cu_call, @@ -28,6 +30,7 @@ with_conditional_node, ) from nemo.utils import logging +from nemo.utils.enum import PrettyStrEnum try: from cuda import cudart @@ -161,7 +164,17 @@ def need_reinit(self, encoder_output_projected: torch.Tensor) -> bool: ) -class GreedyBatchedRNNTLoopLabelsComputer(ConfidenceMethodMixin): +@dataclass +class SeparateGraphsLoopLabels: + """Class to store Cuda graphs for decoding when separate graphs are used""" + + before_outer_loop: torch.cuda.CUDAGraph = field(default_factory=torch.cuda.CUDAGraph) + before_inner_loop: torch.cuda.CUDAGraph = field(default_factory=torch.cuda.CUDAGraph) + inner_loop_code: torch.cuda.CUDAGraph = field(default_factory=torch.cuda.CUDAGraph) + after_inner_loop: torch.cuda.CUDAGraph = field(default_factory=torch.cuda.CUDAGraph) + + +class GreedyBatchedRNNTLoopLabelsComputer(WithOptionalCudaGraphs, ConfidenceMethodMixin): """ Label Looping algorithm implementation: optimized batched greedy decoding. Callable. Iterates over labels, on each step finding the next non-blank label @@ -174,6 +187,16 @@ class GreedyBatchedRNNTLoopLabelsComputer(ConfidenceMethodMixin): INITIAL_MAX_TIME = 375 # initial max time, used to init state for Cuda graphs CUDA_PROGRAM_NAME = b"while_loop_labels_conditional_rnnt.cu" + class CudaGraphsMode(PrettyStrEnum): + FULL_GRAPH = "full_graph" # Cuda graphs with conditional nodes, fastest implementation + NO_WHILE_LOOPS = "no_while_loops" # Decoding with PyTorch while loops + partial Cuda graphs + NO_GRAPHS = "no_graphs" # decoding without graphs, stateful implementation, only for testing purposes + + separate_graphs: Optional[SeparateGraphsLoopLabels] + full_graph: Optional[torch.cuda.CUDAGraph] + cuda_graphs_mode: Optional[CudaGraphsMode] + state: Optional[LoopLabelsState] + def __init__( self, decoder, @@ -203,24 +226,66 @@ def __init__( self.max_symbols = max_symbols_per_step self.preserve_alignments = preserve_alignments self.preserve_frame_confidence = preserve_frame_confidence + self.allow_cuda_graphs = allow_cuda_graphs self._SOS = self._blank_index self._init_confidence_method(confidence_method_cfg=confidence_method_cfg) assert self._SOS == self._blank_index # "blank as pad" algorithm only - self.use_cuda_graphs = allow_cuda_graphs + self.state = None + self.full_graph = None + self.separate_graphs = None - if self.use_cuda_graphs and self.max_symbols is None: - logging.warning("Max symbols is None, which is not allowed with Cuda graphs.") - self.use_cuda_graphs = False + self.cuda_graphs_mode = None + self.maybe_enable_cuda_graphs() - if self.use_cuda_graphs: + def force_cuda_graphs_mode(self, mode: Optional[Union[str, CudaGraphsMode]]): + """ + Method to set graphs mode. Use only for testing purposes. + For debugging the algorithm use "no_graphs" mode, since it is impossible to debug CUDA graphs directly. 
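A short, hedged usage sketch (`computer` below is assumed to be an already-constructed `GreedyBatchedRNNTLoopLabelsComputer`):

    # Force the stateful, graph-free path so the algorithm can be stepped through
    # in a debugger; "no_graphs" exists purely for testing/debugging.
    computer.force_cuda_graphs_mode("no_graphs")
    # ... run decoding, set breakpoints ...
    # Reset and let the class pick the fastest supported mode again.
    computer.force_cuda_graphs_mode(None)
    computer.maybe_enable_cuda_graphs()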
+ """ + self.cuda_graphs_mode = self.CudaGraphsMode(mode) if mode is not None else None + self.state = None + + def maybe_enable_cuda_graphs(self): + """Enable CUDA graphs if conditions met""" + if self.cuda_graphs_mode is not None: + # CUDA graphs are already enabled + return + + if not self.allow_cuda_graphs: + self.cuda_graphs_mode = None + else: + # cuda graphs are allowed + # check basic requirements for cuda graphs + if self.max_symbols is None: + logging.warning("Max symbols per step is None, which is not allowed with Cuda graphs. Setting to `10`") + self.max_symbols = 10 + # basic requirements met, need to check while loops try: check_cuda_python_cuda_graphs_conditional_nodes_supported() - except ImportError as e: - logging.warning(f"No conditional node support. Cuda graphs will be disabled,\n{e.msg}") - self.use_cuda_graphs = False - - self.state: Optional[LoopLabelsState] = None + self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH + except (ImportError, ModuleNotFoundError) as e: + logging.warning( + "No conditional node support for Cuda.\n" + "Cuda graphs with while loops are disabled, decoding speed will be slower\n" + f"Reason: {e.msg}" + ) + self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS + self.reset_cuda_graphs_state() + + def disable_cuda_graphs(self): + """Disable CUDA graphs, can be used to disable graphs temporary, e.g., in training process""" + if self.cuda_graphs_mode is None: + # nothing to disable + return + self.cuda_graphs_mode = None + self.reset_cuda_graphs_state() + + def reset_cuda_graphs_state(self): + """Reset state to release memory (for CUDA graphs implementations)""" + self.state = None + self.full_graph = None + self.separate_graphs = None def loop_labels_torch( self, encoder_output: torch.Tensor, encoder_output_length: torch.Tensor, @@ -237,6 +302,7 @@ def loop_labels_torch( # do not recalculate joint projection, project only once encoder_output_projected = self.joint.project_encoder(encoder_output) + float_dtype = encoder_output_projected.dtype # init output structures: BatchedHyps (for results), BatchedAlignments + last decoder state # init empty batched hypotheses @@ -244,7 +310,7 @@ def loop_labels_torch( batch_size=batch_size, init_length=max_time * self.max_symbols if self.max_symbols is not None else max_time, device=device, - float_dtype=encoder_output_projected.dtype, + float_dtype=float_dtype, ) # sample state, will be replaced further when the decoding for hypothesis is done last_decoder_state = self.decoder.initialize_state(encoder_output_projected) @@ -256,7 +322,7 @@ def loop_labels_torch( logits_dim=self.joint.num_classes_with_blank, init_length=max_time * 2 if use_alignments else 1, # blank for each timestep + text tokens device=device, - float_dtype=encoder_output_projected.dtype, + float_dtype=float_dtype, store_alignments=self.preserve_alignments, store_frame_confidence=self.preserve_frame_confidence, ) @@ -312,7 +378,7 @@ def loop_labels_torch( time_indices=time_indices_current_labels, logits=logits if self.preserve_alignments else None, labels=labels if self.preserve_alignments else None, - confidence=self._get_confidence_tensor(F.log_softmax(logits, dim=-1)) + confidence=self._get_confidence_tensor(F.log_softmax(logits, dim=-1)).to(dtype=float_dtype) if self.preserve_frame_confidence else None, ) @@ -350,7 +416,7 @@ def loop_labels_torch( time_indices=time_indices_current_labels, logits=logits if self.preserve_alignments else None, labels=more_labels if self.preserve_alignments else None, - 
confidence=self._get_confidence_tensor(F.log_softmax(logits, dim=-1)) + confidence=self._get_confidence_tensor(F.log_softmax(logits, dim=-1)).to(dtype=float_dtype) if self.preserve_frame_confidence else None, ) @@ -413,6 +479,8 @@ def loop_labels_cuda_graphs( encoder_output: output from the encoder encoder_output_length: lengths of the utterances in `encoder_output` """ + assert self.cuda_graphs_mode is not None + # do not recalculate joint projection, project only once encoder_output = self.joint.project_encoder(encoder_output) current_batch_size = encoder_output.shape[0] @@ -430,16 +498,27 @@ def loop_labels_cuda_graphs( self.state.encoder_output_length[: encoder_output_length.shape[0]].copy_(encoder_output_length) # set length to zero for elements outside the current batch self.state.encoder_output_length[current_batch_size:].fill_(0) - self.graph.replay() - - # example manual loop (can be used instead of graph.replay()) - # self._before_outer_loop() - # while self.state.active_mask_any.item(): - # self._before_inner_loop_get_decoder_output() - # self._before_inner_loop_get_joint_output() - # while self.state.advance_mask_any.item(): - # self._inner_loop_code() - # self._after_inner_loop() + if self.cuda_graphs_mode is self.CudaGraphsMode.FULL_GRAPH: + self.full_graph.replay() + elif self.cuda_graphs_mode is self.CudaGraphsMode.NO_WHILE_LOOPS: + self.separate_graphs.before_outer_loop.replay() + while self.state.active_mask_any.item(): + self.separate_graphs.before_inner_loop.replay() + while self.state.advance_mask_any.item(): + self.separate_graphs.inner_loop_code.replay() + self.separate_graphs.after_inner_loop.replay() + elif self.cuda_graphs_mode is self.CudaGraphsMode.NO_GRAPHS: + # this mode is only for testing purposes + # manual loop instead of using graphs + self._before_outer_loop() + while self.state.active_mask_any.item(): + self._before_inner_loop_get_decoder_output() + self._before_inner_loop_get_joint_output() + while self.state.advance_mask_any.item(): + self._inner_loop_code() + self._after_inner_loop() + else: + raise NotImplementedError(f"Unknown graph mode: {self.cuda_graphs_mode}") return ( self.state.batched_hyps, @@ -509,12 +588,49 @@ def _graph_reinitialize( ) # to avoid recalculation of joint projection, store decoder output in state self.state.decoder_output = self.joint.project_prednet(decoder_output) + if self.cuda_graphs_mode is self.CudaGraphsMode.FULL_GRAPH: + self._full_graph_compile() + elif self.cuda_graphs_mode is self.CudaGraphsMode.NO_WHILE_LOOPS: + self._partial_graphs_compile() + elif self.cuda_graphs_mode is self.CudaGraphsMode.NO_GRAPHS: + # no graphs needed + pass + else: + raise NotImplementedError + + def _partial_graphs_compile(self): + """Compile decoding by parts""" + # Always create a new stream, because the per-thread default stream disallows stream capture to a graph. 
+ stream_for_graph = torch.cuda.Stream(self.state.device) + self.separate_graphs = SeparateGraphsLoopLabels() + with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( + self.separate_graphs.before_outer_loop, stream=stream_for_graph + ): + self._before_outer_loop() + + with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( + self.separate_graphs.before_inner_loop, stream=stream_for_graph + ): + self._before_inner_loop_get_decoder_output() + self._before_inner_loop_get_joint_output() + + with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( + self.separate_graphs.inner_loop_code, stream=stream_for_graph + ): + self._inner_loop_code() + + with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( + self.separate_graphs.after_inner_loop, stream=stream_for_graph + ): + self._after_inner_loop() + def _full_graph_compile(self): + """Compile full graph for decoding""" # Always create a new stream, because the per-thread default stream disallows stream capture to a graph. stream_for_graph = torch.cuda.Stream(self.state.device) - self.graph = torch.cuda.CUDAGraph() + self.full_graph = torch.cuda.CUDAGraph() with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( - self.graph, stream=stream_for_graph + self.full_graph, stream=stream_for_graph ): self._before_outer_loop() @@ -612,12 +728,13 @@ def _before_inner_loop_get_joint_output(self): # blank_mask = self.labels == self._blank_index self.state.time_indices_current_labels.copy_(self.state.time_indices, non_blocking=True) if self.state.alignments is not None: + float_dtype = self.state.float_dtype self.state.alignments.add_results_masked_no_checks_( active_mask=self.state.active_mask, time_indices=self.state.time_indices_current_labels, logits=logits if self.preserve_alignments else None, labels=self.state.labels if self.preserve_alignments else None, - confidence=self._get_confidence_tensor(F.log_softmax(logits, dim=-1)) + confidence=self._get_confidence_tensor(F.log_softmax(logits, dim=-1)).to(dtype=float_dtype) if self.preserve_frame_confidence else None, ) @@ -662,12 +779,13 @@ def _inner_loop_code(self): torch.where(self.state.advance_mask, more_scores, self.state.scores, out=self.state.scores) if self.state.alignments is not None: + float_dtype = self.state.float_dtype self.state.alignments.add_results_masked_no_checks_( active_mask=self.state.advance_mask, time_indices=self.state.time_indices_current_labels, logits=logits if self.preserve_alignments else None, labels=more_labels if self.preserve_alignments else None, - confidence=self._get_confidence_tensor(F.log_softmax(logits, dim=-1)) + confidence=self._get_confidence_tensor(F.log_softmax(logits, dim=-1)).to(dtype=float_dtype) if self.preserve_frame_confidence else None, ) @@ -721,7 +839,7 @@ def _after_inner_loop(self): def __call__( self, x: torch.Tensor, out_len: torch.Tensor, ) -> Tuple[rnnt_utils.BatchedHyps, Optional[rnnt_utils.BatchedAlignments], Any]: - if self.use_cuda_graphs and x.device.type == "cuda": + if self.cuda_graphs_mode is not None and x.device.type == "cuda": return self.loop_labels_cuda_graphs(encoder_output=x, encoder_output_length=out_len) return self.loop_labels_torch(encoder_output=x, encoder_output_length=out_len) diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py index b136446d97fb..4e514966db2b 100644 --- 
a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py @@ -13,6 +13,7 @@ # limitations under the License. +from dataclasses import dataclass, field from typing import Any, Optional, Tuple, Union import numpy as np @@ -22,6 +23,7 @@ from nemo.collections.asr.parts.utils import rnnt_utils from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodMixin +from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs from nemo.core.utils.cuda_python_utils import ( check_cuda_python_cuda_graphs_conditional_nodes_supported, cu_call, @@ -29,6 +31,7 @@ with_conditional_node, ) from nemo.utils import logging +from nemo.utils.enum import PrettyStrEnum try: from cuda import cudart @@ -167,7 +170,17 @@ def need_reinit(self, encoder_output_projected: torch.Tensor) -> bool: ) -class GreedyBatchedTDTLoopLabelsComputer(ConfidenceMethodMixin): +@dataclass +class SeparateGraphsLoopLabels: + """Class to store Cuda graphs for decoding when separate graphs are used""" + + before_outer_loop: torch.cuda.CUDAGraph = field(default_factory=torch.cuda.CUDAGraph) + before_inner_loop: torch.cuda.CUDAGraph = field(default_factory=torch.cuda.CUDAGraph) + inner_loop_code: torch.cuda.CUDAGraph = field(default_factory=torch.cuda.CUDAGraph) + after_inner_loop: torch.cuda.CUDAGraph = field(default_factory=torch.cuda.CUDAGraph) + + +class GreedyBatchedTDTLoopLabelsComputer(WithOptionalCudaGraphs, ConfidenceMethodMixin): """ Label Looping algorithm implementation: optimized batched greedy decoding. Callable. Iterates over labels, on each step finding the next non-blank label @@ -180,6 +193,16 @@ class GreedyBatchedTDTLoopLabelsComputer(ConfidenceMethodMixin): INITIAL_MAX_TIME = 375 # initial max time, used to init state for Cuda graphs CUDA_PROGRAM_NAME = b"while_loop_labels_conditional_tdt.cu" + class CudaGraphsMode(PrettyStrEnum): + FULL_GRAPH = "full_graph" # Cuda graphs with conditional nodes, fastest implementation + NO_WHILE_LOOPS = "no_while_loops" # Decoding with PyTorch while loops + partial Cuda graphs + NO_GRAPHS = "no_graphs" # decoding without graphs, stateful implementation, only for testing purposes + + separate_graphs: Optional[SeparateGraphsLoopLabels] + full_graph: Optional[torch.cuda.CUDAGraph] + cuda_graphs_mode: Optional[CudaGraphsMode] + state: Optional[LoopLabelsState] + def __init__( self, decoder, @@ -215,25 +238,67 @@ def __init__( self.max_symbols = max_symbols_per_step self.preserve_alignments = preserve_alignments self.preserve_frame_confidence = preserve_frame_confidence + self.allow_cuda_graphs = allow_cuda_graphs self.include_duration_confidence = include_duration_confidence self._SOS = self._blank_index self._init_confidence_method(confidence_method_cfg=confidence_method_cfg) assert self._SOS == self._blank_index # "blank as pad" algorithm only - self.use_cuda_graphs = allow_cuda_graphs + self.state = None + self.full_graph = None + self.separate_graphs = None - if self.use_cuda_graphs and self.max_symbols is None: - logging.warning("Max symbols is None, which is not allowed with Cuda graphs.") - self.use_cuda_graphs = False + self.cuda_graphs_mode = None + self.maybe_enable_cuda_graphs() - if self.use_cuda_graphs: + def maybe_enable_cuda_graphs(self): + """Enable CUDA graphs if conditions met""" + if self.cuda_graphs_mode is not None: + # CUDA graphs are enabled + return + + if not self.allow_cuda_graphs: + self.cuda_graphs_mode = None + else: + # cuda graphs are 
allowed + # check basic requirements for cuda graphs + if self.max_symbols is None: + logging.warning("Max symbols per step is None, which is not allowed with Cuda graphs. Setting to `10`") + self.max_symbols = 10 + # basic requirements met, need to check while loops try: check_cuda_python_cuda_graphs_conditional_nodes_supported() - except ImportError as e: - logging.warning(f"No conditional node support. Cuda graphs will be disabled,\n{e.msg}") - self.use_cuda_graphs = False - - self.state: Optional[LoopLabelsState] = None + self.cuda_graphs_mode = self.CudaGraphsMode.FULL_GRAPH + except (ImportError, ModuleNotFoundError) as e: + logging.warning( + "No conditional node support for Cuda.\n" + "Cuda graphs with while loops are disabled, decoding speed will be slower\n" + f"Reason: {e.msg}" + ) + self.cuda_graphs_mode = self.CudaGraphsMode.NO_WHILE_LOOPS + self.reset_cuda_graphs_state() + + def disable_cuda_graphs(self): + """Disable CUDA graphs, can be used to disable graphs temporary, e.g., in training process""" + if self.cuda_graphs_mode is None: + # nothing to disable + return + self.cuda_graphs_mode = None + self.reset_cuda_graphs_state() + + def reset_cuda_graphs_state(self): + """Reset state to release memory (for CUDA graphs implementations)""" + self.state = None + self.full_graph = None + self.separate_graphs = None + + def force_cuda_graphs_mode(self, mode: Optional[Union[str, CudaGraphsMode]]): + """ + Method to set graphs mode. Use only for testing purposes. + For debugging the algorithm use "no_graphs" mode, since it is impossible to debug CUDA graphs directly. + """ + self.cuda_graphs_mode = self.CudaGraphsMode(mode) if mode is not None else None + self.state = None def loop_labels_torch( self, encoder_output: torch.Tensor, encoder_output_length: torch.Tensor, @@ -250,7 +315,7 @@ def loop_labels_torch( # do not recalculate joint projection, project only once encoder_output_projected = self.joint.project_encoder(encoder_output) - dtype = encoder_output_projected.dtype + float_dtype = encoder_output_projected.dtype # init output structures: BatchedHyps (for results), BatchedAlignments + last decoder state # init empty batched hypotheses @@ -258,7 +323,7 @@ def loop_labels_torch( batch_size=batch_size, init_length=max_time * self.max_symbols if self.max_symbols is not None else max_time, device=device, - float_dtype=dtype, + float_dtype=float_dtype, ) # sample state, will be replaced further when the decoding for hypothesis is done last_decoder_state = self.decoder.initialize_state(encoder_output_projected) @@ -270,7 +335,7 @@ def loop_labels_torch( logits_dim=self.joint.num_classes_with_blank, init_length=max_time * 2 if use_alignments else 1, # blank for each timestep + text tokens device=device, - float_dtype=dtype, + float_dtype=float_dtype, store_alignments=self.preserve_alignments, store_frame_confidence=self.preserve_frame_confidence, with_duration_confidence=self.include_duration_confidence, @@ -338,16 +403,18 @@ def loop_labels_torch( confidence=torch.stack( ( self._get_confidence_tensor(F.log_softmax(logits[:, :-num_durations], dim=-1)).to( - dtype=dtype + dtype=float_dtype ), self._get_confidence_tensor(F.log_softmax(logits[:, -num_durations:], dim=-1)).to( - dtype=dtype + dtype=float_dtype ), ), dim=-1, ) if self.include_duration_confidence - else self._get_confidence_tensor(F.log_softmax(logits[:, :-num_durations], dim=-1)).to(dtype=dtype) + else self._get_confidence_tensor(F.log_softmax(logits[:, :-num_durations], dim=-1)).to( + dtype=float_dtype + ) if 
self.preserve_frame_confidence else None, ) @@ -390,17 +457,17 @@ def loop_labels_torch( confidence=torch.stack( ( self._get_confidence_tensor(F.log_softmax(logits[:, :-num_durations], dim=-1)).to( - dtype=dtype + dtype=float_dtype ), self._get_confidence_tensor(F.log_softmax(logits[:, -num_durations:], dim=-1)).to( - dtype=dtype + dtype=float_dtype ), ), dim=-1, ) if self.include_duration_confidence else self._get_confidence_tensor(F.log_softmax(logits[:, :-num_durations], dim=-1)).to( - dtype=dtype + dtype=float_dtype ) if self.preserve_frame_confidence else None, @@ -467,6 +534,8 @@ def loop_labels_cuda_graphs( encoder_output: output from the encoder encoder_output_length: lengths of the utterances in `encoder_output` """ + assert self.cuda_graphs_mode is not None + # do not recalculate joint projection, project only once encoder_output = self.joint.project_encoder(encoder_output) current_batch_size = encoder_output.shape[0] @@ -484,16 +553,27 @@ def loop_labels_cuda_graphs( self.state.encoder_output_length[: encoder_output_length.shape[0]].copy_(encoder_output_length) # set length to zero for elements outside the current batch self.state.encoder_output_length[current_batch_size:].fill_(0) - self.graph.replay() - - # example manual loop (can be used instead of graph.replay()) - # self._before_outer_loop() - # while self.state.active_mask_any.item(): - # self._before_inner_loop_get_decoder_output() - # self._before_inner_loop_get_joint_output() - # while self.state.advance_mask_any.item(): - # self._inner_loop_code() - # self._after_inner_loop() + if self.cuda_graphs_mode is self.CudaGraphsMode.FULL_GRAPH: + self.full_graph.replay() + elif self.cuda_graphs_mode is self.CudaGraphsMode.NO_WHILE_LOOPS: + self.separate_graphs.before_outer_loop.replay() + while self.state.active_mask_any.item(): + self.separate_graphs.before_inner_loop.replay() + while self.state.advance_mask_any.item(): + self.separate_graphs.inner_loop_code.replay() + self.separate_graphs.after_inner_loop.replay() + elif self.cuda_graphs_mode is self.CudaGraphsMode.NO_GRAPHS: + # this mode is only for testing purposes + # manual loop instead of using graphs + self._before_outer_loop() + while self.state.active_mask_any.item(): + self._before_inner_loop_get_decoder_output() + self._before_inner_loop_get_joint_output() + while self.state.advance_mask_any.item(): + self._inner_loop_code() + self._after_inner_loop() + else: + raise NotImplementedError(f"Unknown graph mode: {self.cuda_graphs_mode}") return ( self.state.batched_hyps, @@ -565,12 +645,49 @@ def _graph_reinitialize( ) # to avoid recalculation of joint projection, store decoder output in state self.state.decoder_output = self.joint.project_prednet(decoder_output) + if self.cuda_graphs_mode is self.CudaGraphsMode.FULL_GRAPH: + self._full_graph_compile() + elif self.cuda_graphs_mode is self.CudaGraphsMode.NO_WHILE_LOOPS: + self._partial_graphs_compile() + elif self.cuda_graphs_mode is self.CudaGraphsMode.NO_GRAPHS: + # no graphs needed + pass + else: + raise NotImplementedError + + def _partial_graphs_compile(self): + """Compile decoding by parts""" + # Always create a new stream, because the per-thread default stream disallows stream capture to a graph. 
+ stream_for_graph = torch.cuda.Stream(self.state.device) + self.separate_graphs = SeparateGraphsLoopLabels() + with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( + self.separate_graphs.before_outer_loop, stream=stream_for_graph + ): + self._before_outer_loop() + + with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( + self.separate_graphs.before_inner_loop, stream=stream_for_graph + ): + self._before_inner_loop_get_decoder_output() + self._before_inner_loop_get_joint_output() + + with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( + self.separate_graphs.inner_loop_code, stream=stream_for_graph + ): + self._inner_loop_code() + + with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( + self.separate_graphs.after_inner_loop, stream=stream_for_graph + ): + self._after_inner_loop() + def _full_graph_compile(self): + """Compile full graph for decoding""" # Always create a new stream, because the per-thread default stream disallows stream capture to a graph. stream_for_graph = torch.cuda.Stream(self.state.device) - self.graph = torch.cuda.CUDAGraph() + self.full_graph = torch.cuda.CUDAGraph() with torch.cuda.stream(stream_for_graph), torch.inference_mode(), torch.cuda.graph( - self.graph, stream=stream_for_graph + self.full_graph, stream=stream_for_graph ): self._before_outer_loop() @@ -651,7 +768,6 @@ def _before_inner_loop_get_joint_output(self): # stage 2: get joint output, iteratively seeking for non-blank labels # blank label in `labels` tensor means "end of hypothesis" (for this index) self.state.active_mask_prev.copy_(self.state.active_mask, non_blocking=True) - dtype = self.state.encoder_output_projected.dtype logits = ( self.joint.joint_after_projection( self.state.encoder_output_projected[self.state.batch_indices, self.state.safe_time_indices].unsqueeze( @@ -675,6 +791,7 @@ def _before_inner_loop_get_joint_output(self): # for blank labels force duration >= 1 durations.masked_fill_(torch.logical_and(durations == 0, self.state.blank_mask), 1) if self.state.alignments is not None: + float_dtype = self.state.float_dtype self.state.alignments.add_results_masked_no_checks_( active_mask=self.state.active_mask, time_indices=self.state.time_indices_current_labels, @@ -684,17 +801,17 @@ def _before_inner_loop_get_joint_output(self): ( self._get_confidence_tensor( F.log_softmax(logits[:, : -self.state.all_durations.shape[0]], dim=-1) - ).to(dtype=dtype), + ).to(dtype=float_dtype), self._get_confidence_tensor( F.log_softmax(logits[:, -self.state.all_durations.shape[0] :], dim=-1) - ).to(dtype=dtype), + ).to(dtype=float_dtype), ), dim=-1, ) if self.include_duration_confidence else self._get_confidence_tensor( F.log_softmax(logits[:, : -self.state.all_durations.shape[0]], dim=-1) - ).to(dtype=dtype) + ).to(dtype=float_dtype) if self.preserve_frame_confidence else None, ) @@ -720,7 +837,6 @@ def _inner_loop_code(self): self.state.time_indices_current_labels, out=self.state.time_indices_current_labels, ) - dtype = self.state.encoder_output_projected.dtype logits = ( self.joint.joint_after_projection( self.state.encoder_output_projected[self.state.batch_indices, self.state.safe_time_indices].unsqueeze( @@ -742,6 +858,7 @@ def _inner_loop_code(self): torch.where(self.state.advance_mask, more_scores, self.state.scores, out=self.state.scores) if self.state.alignments is not None: + float_dtype = self.state.float_dtype self.state.alignments.add_results_masked_no_checks_( 
                active_mask=self.state.advance_mask,
                time_indices=self.state.time_indices_current_labels,
@@ -751,17 +868,17 @@ def _inner_loop_code(self):
                    (
                        self._get_confidence_tensor(
                            F.log_softmax(logits[:, : -self.state.all_durations.shape[0]], dim=-1)
-                        ).to(dtype=dtype),
+                        ).to(dtype=float_dtype),
                        self._get_confidence_tensor(
                            F.log_softmax(logits[:, -self.state.all_durations.shape[0] :], dim=-1)
-                        ).to(dtype=dtype),
+                        ).to(dtype=float_dtype),
                    ),
                    dim=-1,
                )
                if self.include_duration_confidence
                else self._get_confidence_tensor(
                    F.log_softmax(logits[:, : -self.state.all_durations.shape[0]], dim=-1)
-                ).to(dtype=dtype)
+                ).to(dtype=float_dtype)
                if self.preserve_frame_confidence
                else None,
            )
@@ -822,7 +939,7 @@ def _after_inner_loop(self):
     def __call__(
         self, x: torch.Tensor, out_len: torch.Tensor,
     ) -> Tuple[rnnt_utils.BatchedHyps, Optional[rnnt_utils.BatchedAlignments], Any]:
-        if self.use_cuda_graphs and x.device.type == "cuda":
+        if self.cuda_graphs_mode is not None and x.device.type == "cuda":
             return self.loop_labels_cuda_graphs(encoder_output=x, encoder_output_length=out_len)
         return self.loop_labels_torch(encoder_output=x, encoder_output_length=out_len)
diff --git a/nemo/collections/common/parts/optional_cuda_graphs.py b/nemo/collections/common/parts/optional_cuda_graphs.py
new file mode 100644
index 000000000000..2417d9e00370
--- /dev/null
+++ b/nemo/collections/common/parts/optional_cuda_graphs.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+from typing import Optional
+
+import torch.nn as nn
+
+from nemo.utils import logging
+
+
+class WithOptionalCudaGraphs(abc.ABC):
+    """
+    Abstract interface for modules with CUDA graphs.
+    Allows to enable/disable CUDA graphs on the fly.
+    """
+
+    @classmethod
+    def disable_cuda_graphs_recursive(cls, module: nn.Module, attribute_path: Optional[str] = None):
+        """
+        Disable CUDA graphs, finding the submodule recursively.
+
+        Args:
+            module: instance of nn.Module
+            attribute_path: field containing instance of WithOptionalCudaGraphs
+                E.g., "decoding.decoding" means that "<module>.decoding.decoding" is checked.
+                If None, "<module>" is checked.
+        """
+        attributes = attribute_path.split(".") if attribute_path else []
+
+        for name, submodule in module.named_modules():
+            object_to_check = submodule
+            try:
+                # recursively get attribute by iterating attribute_path
+                for attribute in attributes:
+                    object_to_check = getattr(object_to_check, attribute)
+            except AttributeError:
+                continue  # loop over modules, no attribute
+
+            if isinstance(object_to_check, cls):
+                object_to_check.disable_cuda_graphs()
+                logging.info(f"Disabled CUDA graphs for module {type(submodule)}" + ".".join([name] + attributes))
+
+    @classmethod
+    def enable_cuda_graphs_recursive(cls, module: nn.Module, attribute_path: Optional[str] = None):
+        """
+        Enable CUDA graphs, finding the submodule recursively.
+
+        Args:
+            module: instance of nn.Module
+            attribute_path: field containing instance of WithOptionalCudaGraphs
+                E.g., "decoding.decoding" means that "<module>.decoding.decoding" is checked.
+                If None, "<module>" is checked.
+        """
+        attributes = attribute_path.split(".") if attribute_path else []
+
+        for name, submodule in module.named_modules():
+            object_to_check = submodule
+            try:
+                # recursively get attribute by iterating attribute_path
+                for attribute in attributes:
+                    object_to_check = getattr(object_to_check, attribute)
+            except AttributeError:
+                continue  # loop over modules, no attribute
+
+            if isinstance(object_to_check, cls):
+                object_to_check.maybe_enable_cuda_graphs()
+                logging.info(f"Enabled CUDA graphs for module {type(submodule)}" + ".".join([name] + attributes))
+
+    @abc.abstractmethod
+    def disable_cuda_graphs(self):
+        """Disable (maybe temporarily) CUDA graphs"""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def maybe_enable_cuda_graphs(self):
+        """Enable CUDA graphs if all conditions are met"""
+        raise NotImplementedError
diff --git a/nemo/core/utils/cuda_python_utils.py b/nemo/core/utils/cuda_python_utils.py
index fb47c22ceee0..eb8897df0797 100644
--- a/nemo/core/utils/cuda_python_utils.py
+++ b/nemo/core/utils/cuda_python_utils.py
@@ -25,7 +25,7 @@ def check_cuda_python_cuda_graphs_conditional_nodes_supported():
     try:
         from cuda import cuda
     except ImportError:
-        raise ModuleNotFoundError("Please do `pip install cuda-python>=12.3`")
+        raise ModuleNotFoundError("No `cuda-python` module. 
Please do `pip install cuda-python>=12.3`") from cuda import __version__ as cuda_python_version diff --git a/tests/collections/asr/decoding/rnnt_alignments_check.py b/tests/collections/asr/decoding/rnnt_alignments_check.py index aa4d5f044de1..d44f7f8fd985 100644 --- a/tests/collections/asr/decoding/rnnt_alignments_check.py +++ b/tests/collections/asr/decoding/rnnt_alignments_check.py @@ -28,13 +28,14 @@ PRETRAINED_MODEL_NAME = "stt_en_conformer_transducer_small" -def get_rnnt_alignments(strategy: str, loop_labels: bool = True, location="cuda"): +def get_rnnt_alignments(strategy: str, loop_labels: bool = True, use_cuda_graph_decoder=False, location="cuda"): cfg = OmegaConf.structured(TranscriptionConfig(pretrained_name=PRETRAINED_MODEL_NAME)) cfg.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True cfg.rnnt_decoding.preserve_alignments = True cfg.rnnt_decoding.strategy = strategy if cfg.rnnt_decoding.strategy == "greedy_batch": cfg.rnnt_decoding.greedy.loop_labels = loop_labels + cfg.rnnt_decoding.greedy.use_cuda_graph_decoder = use_cuda_graph_decoder cfg.dataset_manifest = TEST_DATA_PATH filepaths = prepare_audio_data(cfg)[0][:10] # selecting 10 files only @@ -73,10 +74,15 @@ def cleanup_local_folder(): # TODO: add the same tests for multi-blank RNNT decoding @pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine') @pytest.mark.parametrize("loop_labels", [True, False]) -def test_rnnt_alignments(loop_labels: bool): +@pytest.mark.parametrize("use_cuda_graph_decoder", [True, False]) +def test_rnnt_alignments(loop_labels: bool, use_cuda_graph_decoder: bool): + if not loop_labels and use_cuda_graph_decoder: + pytest.skip("Frame-Looping algorithm with CUDA graphs does not yet support alignments") # using greedy as baseline and comparing all other configurations to it ref_transcriptions = get_rnnt_alignments("greedy") - transcriptions = get_rnnt_alignments("greedy_batch", loop_labels=loop_labels) + transcriptions = get_rnnt_alignments( + "greedy_batch", loop_labels=loop_labels, use_cuda_graph_decoder=use_cuda_graph_decoder + ) # comparing that label sequence in alignments is exactly the same # we can't compare logits as well, because they are expected to be # slightly different in batched and single-sample mode diff --git a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py index 538ff9d71cf1..31fe822573ce 100644 --- a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py +++ b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py @@ -11,19 +11,38 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import copy import glob -import tempfile import jiwer import pytest import torch -from omegaconf import OmegaConf, open_dict +from omegaconf import open_dict from nemo.collections.asr.models import ASRModel from nemo.core.utils.cuda_python_utils import skip_cuda_python_test_if_cuda_graphs_conditional_nodes_not_supported +@pytest.fixture(scope="module") +def stt_en_fastconformer_transducer_xlarge(): + model_name = "stt_en_fastconformer_transducer_xlarge" + return ASRModel.from_pretrained(model_name, map_location="cpu") + + +@pytest.fixture(scope="module") +def stt_en_fastconformer_transducer_xxlarge(): + model_name = "stt_en_fastconformer_transducer_xxlarge" + return ASRModel.from_pretrained(model_name, map_location="cpu") + + +@pytest.fixture(scope="module") +def stt_en_fastconformer_transducer_large(): + model_name = "stt_en_fastconformer_transducer_large" + return ASRModel.from_pretrained(model_name, map_location="cpu") + + +@pytest.mark.with_downloads +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA decoder can run only on CUDA") @pytest.mark.parametrize( ("model_name", "batch_size", "enable_bfloat16"), [ @@ -42,28 +61,87 @@ ], ) @pytest.mark.parametrize("loop_labels", [False, True]) -def test_cuda_graph_rnnt_greedy_decoder(model_name, batch_size, enable_bfloat16, loop_labels: bool): - skip_cuda_python_test_if_cuda_graphs_conditional_nodes_not_supported() +def test_cuda_graph_rnnt_greedy_decoder(model_name, batch_size, enable_bfloat16, loop_labels: bool, request): + if not loop_labels: + skip_cuda_python_test_if_cuda_graphs_conditional_nodes_not_supported() + if enable_bfloat16 and not torch.cuda.is_bf16_supported(): + pytest.skip("bfloat16 is not supported") + + device = torch.device("cuda") + nemo_model = request.getfixturevalue(model_name).to(device) + decoding_config = copy.deepcopy(nemo_model.cfg.decoding) + + with open_dict(decoding_config): + decoding_config["greedy"]["max_symbols"] = 5 + decoding_config["greedy"]["loop_labels"] = loop_labels + decoding_config["greedy"]["use_cuda_graph_decoder"] = False + + nemo_model.change_decoding_strategy(decoding_config) + audio_filepaths = glob.glob("tests/.data/asr/test/an4/wav/*.wav") + + with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): + actual_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) + + decoding_config["greedy"]["use_cuda_graph_decoder"] = True + + nemo_model.change_decoding_strategy(decoding_config) + + with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): + fast_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) - conf = ASRModel.from_pretrained(model_name, return_config=True) - with open_dict(conf): - conf["decoding"]["greedy"]["max_symbols"] = 5 - conf["decoding"]["greedy"]["loop_labels"] = loop_labels - conf["decoding"]["greedy"]["use_cuda_graph_decoder"] = False + wer = jiwer.wer(actual_transcripts, fast_transcripts) - with tempfile.NamedTemporaryFile() as fp: - OmegaConf.save(config=conf, f=fp.name) - nemo_model = ASRModel.from_pretrained(model_name, override_config_path=fp.name, map_location="cuda") + assert wer <= 1e-3, "Cuda graph greedy decoder should match original decoder implementation." 
+ for actual, fast in zip(actual_transcripts, fast_transcripts): + if actual != fast: + print("erroneous samples:") + print("Original transcript:", actual) + print("New transcript:", fast) + + +@pytest.mark.with_downloads +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA decoder can run only on CUDA") +@pytest.mark.parametrize("force_mode", ["no_graphs", "no_while_loops", "full_graph"]) +@pytest.mark.parametrize("enable_bfloat16", [False, True]) +def test_loop_labels_cuda_graph_rnnt_greedy_decoder_forced_mode( + stt_en_fastconformer_transducer_large, force_mode: str, enable_bfloat16: bool +): + """ + Testing Label-Looping algorithm with CUDA graphs in forced mode. + This test guarantees that we check that the fallback behavior is working. + NB: Since it is impossible to directly debug CUDA graphs, when making changes, + start testing and debugging the code with forced "no_graphs" mode. + """ + if enable_bfloat16 and not torch.cuda.is_bf16_supported(): + pytest.skip("bfloat16 is not supported") + + if force_mode == "full_graph": + skip_cuda_python_test_if_cuda_graphs_conditional_nodes_not_supported() + + batch_size = 16 + device = torch.device("cuda") + nemo_model = stt_en_fastconformer_transducer_large.to(device) + decoding_config = copy.deepcopy(nemo_model.cfg.decoding) + + with open_dict(decoding_config): + decoding_config["greedy"]["max_symbols"] = 5 + decoding_config["greedy"]["loop_labels"] = True + decoding_config["greedy"]["use_cuda_graph_decoder"] = False + # test that alignments and confidence do not introduce failures + decoding_config["greedy"]["preserve_alignments"] = True + decoding_config["greedy"]["preserve_frame_confidence"] = True + + nemo_model.change_decoding_strategy(decoding_config) audio_filepaths = glob.glob("tests/.data/asr/test/an4/wav/*.wav") with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): actual_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) - with open_dict(conf): - conf["decoding"]["greedy"]["use_cuda_graph_decoder"] = True - - nemo_model.change_decoding_strategy(conf["decoding"]) + # transcribe with use implementation with cuda graphs + decoding_config["greedy"]["use_cuda_graph_decoder"] = True + nemo_model.change_decoding_strategy(decoding_config) + nemo_model.decoding.decoding._decoding_computer.force_cuda_graphs_mode(mode=force_mode) with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=enable_bfloat16): fast_transcripts, _ = nemo_model.transcribe(audio_filepaths, batch_size=batch_size, num_workers=None) @@ -79,27 +157,27 @@ def test_cuda_graph_rnnt_greedy_decoder(model_name, batch_size, enable_bfloat16, print("New transcript:", fast) +@pytest.mark.with_downloads +@pytest.mark.skipif(not torch.cuda.is_available() or torch.cuda.device_count() < 2, reason="Test requires 2 GPUs") @pytest.mark.parametrize("loop_labels", [False, True]) -def test_change_devices(loop_labels: bool): - skip_cuda_python_test_if_cuda_graphs_conditional_nodes_not_supported() - - if torch.cuda.device_count() < 2: - pytest.skip("Test requires more than 2 GPUs") +def test_change_devices(loop_labels: bool, stt_en_fastconformer_transducer_xlarge): + if not loop_labels: + skip_cuda_python_test_if_cuda_graphs_conditional_nodes_not_supported() first_device = torch.device("cuda:0") second_device = torch.device("cuda:1") - model_name = "stt_en_fastconformer_transducer_xlarge" batch_size = 8 - conf = ASRModel.from_pretrained(model_name, return_config=True) - with open_dict(conf): - 
conf["decoding"]["greedy"]["max_symbols"] = 5 - conf["decoding"]["greedy"]["loop_labels"] = loop_labels - conf["decoding"]["greedy"]["use_cuda_graph_decoder"] = True + nemo_model = stt_en_fastconformer_transducer_xlarge.to(second_device) + decoding_config = copy.deepcopy(nemo_model.cfg.decoding) + + with open_dict(decoding_config): + decoding_config["greedy"]["max_symbols"] = 5 + decoding_config["greedy"]["loop_labels"] = loop_labels + decoding_config["greedy"]["use_cuda_graph_decoder"] = True - nemo_model = ASRModel.from_pretrained(model_name, map_location=second_device) - nemo_model.change_decoding_strategy(conf["decoding"]) + nemo_model.change_decoding_strategy(decoding_config) # Test that the model can run successfully when it is first # initialized on second_device and then transferred to diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index a6e3714f20f5..c3b214751d04 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -432,9 +432,14 @@ def test_BeamRNNTInferConfig(self): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding(self, greedy_class): + def test_greedy_decoding(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -454,7 +459,14 @@ def test_greedy_decoding(self, greedy_class): for joint_type in [RNNTJoint, HATJoint]: joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, + joint_net, + blank_index=len(token_list) - 1, + max_symbols_per_step=5, + **additional_decoding_kwargs, + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) diff --git a/tests/collections/common/test_optional_cuda_graphs.py b/tests/collections/common/test_optional_cuda_graphs.py new file mode 100644 index 000000000000..7b1dda775863 --- /dev/null +++ b/tests/collections/common/test_optional_cuda_graphs.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from types import SimpleNamespace + +import torch.nn as nn + +from nemo.collections.common.parts.optional_cuda_graphs import WithOptionalCudaGraphs + + +class MockClassWithCudaGraphs(WithOptionalCudaGraphs): + def __init__(self): + super().__init__() + self.cuda_graphs_used = True + + def disable_cuda_graphs(self): + self.cuda_graphs_used = False + + def maybe_enable_cuda_graphs(self): + self.cuda_graphs_used = True + + +class MockModuleWithCudaGraphs(MockClassWithCudaGraphs, nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 20) + + def forward(self, x): + return self.linear(x) + + +class MockModuleWithCudaGraphsByPath(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 20) + self.decoding = SimpleNamespace(decoding=MockClassWithCudaGraphs()) + + def forward(self, x): + return self.linear(x) + + +class TestWithOptionalCudaGraphs: + def test_module_toggle_cuda_graphs(self): + module_with_graphs = MockModuleWithCudaGraphs() + assert module_with_graphs.cuda_graphs_used + WithOptionalCudaGraphs.disable_cuda_graphs_recursive(module_with_graphs) + assert not module_with_graphs.cuda_graphs_used + WithOptionalCudaGraphs.enable_cuda_graphs_recursive(module_with_graphs) + assert module_with_graphs.cuda_graphs_used + + def test_module_toggle_cuda_graphs_by_path(self): + module_with_graphs_by_path = MockModuleWithCudaGraphsByPath() + assert module_with_graphs_by_path.decoding.decoding.cuda_graphs_used + WithOptionalCudaGraphs.disable_cuda_graphs_recursive( + module_with_graphs_by_path, attribute_path="decoding.decoding" + ) + assert not module_with_graphs_by_path.decoding.decoding.cuda_graphs_used + WithOptionalCudaGraphs.enable_cuda_graphs_recursive( + module_with_graphs_by_path, attribute_path="decoding.decoding" + ) + assert module_with_graphs_by_path.decoding.decoding.cuda_graphs_used From 10e15ed1ffdf409c1b130c024524d056ea13ffa7 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Fri, 3 May 2024 08:54:01 -0500 Subject: [PATCH 027/178] Alit/griffin (#9021) * add init griffin * remove unnecessary imports * add sft * add sft model init * add text gen starategy for Griffin no cache * test SFT * minor fix to config * fix logprob output issue * sft WS fixed * replace trainer in conversion script * Revert "Fix PTL2.2 saving multiple `*-last.ckpt` checkpoints in resumed training (#8480)" This reverts commit 11b7a733cbd4b8311eacba581323f88c7cd4bac4. * Revert "FSDP update to PTL 2.2 (#8658)" This reverts commit 355e36c344be55b2bf7b1fd55f5554a831e6fcd3. 
* init dist opt * add peft * fix generate script * convert to HF format * further cleanups * minor fix * minor fix * more refactoring * remove local path from config * undo unnecessary changes * remove pretraining * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix val param sync * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Addresing MR comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * code ql fixed * more code ql * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * address comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add jenkins * remove jenkins for momentarily * add reqs for griffin * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add req test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add reqs to nlp * add reqs to nlp * replace torch scan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * jit fusion for embedding decoder * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * jit fusion for embedding decoder * add fix to rglru * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Ali Taghibakhshi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../conf/megatron_griffin_config.yaml | 168 +++++++++ .../megatron_griffin_finetuning_config.yaml | 285 ++++++++++++++++ .../megatron_griffin_generate_config.yaml | 292 ++++++++++++++++ .../megatron_griffin_finetuning.py | 60 ++++ .../megatron_griffin_generate.py | 69 ++++ .../megatron/gpt_sft_dataset.py | 10 +- .../megatron/griffin/__init__.py | 13 + .../megatron/griffin/griffin_block.py | 75 ++++ .../megatron/griffin/griffin_layer_spec.py | 81 +++++ .../megatron/griffin/griffin_model.py | 156 +++++++++ .../megatron/griffin/recurrent_layer.py | 106 ++++++ .../megatron/griffin/recurrent_module.py | 321 ++++++++++++++++++ .../megatron_gpt_sft_model.py | 1 + .../megatron_griffin_model.py | 96 ++++++ .../megatron_griffin_sft_model.py | 55 +++ .../common/text_generation_strategy.py | 75 ++++ requirements/requirements_nlp.txt | 2 + .../convert_griffin_hf_to_nemo.py | 174 ++++++++++ .../convert_griffin_nemo_to_hf.py | 147 ++++++++ 19 files changed, 2185 insertions(+), 1 deletion(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_griffin_config.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_griffin_finetuning_config.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_griffin_generate_config.yaml create mode 100644 examples/nlp/language_modeling/megatron_griffin_finetuning.py create mode 100644 examples/nlp/language_modeling/megatron_griffin_generate.py create mode 100755 nemo/collections/nlp/models/language_modeling/megatron/griffin/__init__.py create mode 100755 nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_block.py create mode 100755 
nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_layer_spec.py create mode 100755 nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_model.py create mode 100755 nemo/collections/nlp/models/language_modeling/megatron/griffin/recurrent_layer.py create mode 100755 nemo/collections/nlp/models/language_modeling/megatron/griffin/recurrent_module.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_griffin_model.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_griffin_sft_model.py create mode 100644 scripts/checkpoint_converters/convert_griffin_hf_to_nemo.py create mode 100644 scripts/checkpoint_converters/convert_griffin_nemo_to_hf.py diff --git a/examples/nlp/language_modeling/conf/megatron_griffin_config.yaml b/examples/nlp/language_modeling/conf/megatron_griffin_config.yaml new file mode 100644 index 000000000000..ea23cf630f8b --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_griffin_config.yaml @@ -0,0 +1,168 @@ +name: megatron_griffin +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_griffin + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_griffin--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + restore_from_path: null + # model parallelism + micro_batch_size: 2 + global_batch_size: 2 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + vocab_size: 256000 + # model architecture + encoder_seq_length: 512 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + logits_soft_cap: 30.0 + num_layers: 26 + gated_linear_unit: True + window_size: [1024, 0] + num_query_groups: 1 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 2560 + bias_activation_fusion: True + ffn_hidden_size: 7680 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 10 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. 
+ normalization: RMSNorm + layernorm_epsilon: 1e-6 + rotary_interleaved: False + layernorm_zero_centered_gamma: True + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + + tokenizer: + library: 'huggingface' + type: 'google/recurrentgemma-2b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: False + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. 
+ # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + data: + # Path to data must be specified by the user. + # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + data_prefix: [1.0, /path/to/data] + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 0 + dataloader_type: single # cyclic, LDDL + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + masked_lm_prob: 0.15 # Probability of replacing a token with mask. + short_seq_prob: 0.1 # Probability of producing a short sequence. + ceil_to_power_2: True + + optim: + name: fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_griffin_finetuning_config.yaml b/examples/nlp/language_modeling/conf/megatron_griffin_finetuning_config.yaml new file mode 100644 index 000000000000..64d1b67bc148 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_griffin_finetuning_config.yaml @@ -0,0 +1,285 @@ +name: megatron_griffin +restore_from_path: ${model.restore_from_path} # used when starting from a .nemo file + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + limit_val_batches: 1024 + limit_test_batches: 500 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: + project: griffin + name: sft-test + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + +model: + restore_from_path: + # model parallelism + micro_batch_size: 2 + global_batch_size: 2 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + vocab_size: 256000 + apply_rope_fusion: True + # model architecture + encoder_seq_length: 512 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 26 + gated_linear_unit: True + window_size: [1024, 0] + num_query_groups: 1 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 2560 + bias_activation_fusion: True + ffn_hidden_size: 7680 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 10 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-6 + rotary_interleaved: False + layernorm_zero_centered_gamma: True + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + activation: 'fast-geglu' + + tokenizer: + library: 'huggingface' + type: 'google/recurrentgemma-2b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: False + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + peft: + peft_scheme: "lora" # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. 
+ column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: null # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: [1.0] # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + validation_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. 
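+      # Example (hypothetical paths), using the same list format as train_ds above:
+      #   file_names:
+      #     - /path/to/squad_val.jsonl
+      #     - /path/to/mnli_val.jsonl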
+ names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
+ num_classes: null + + optim: + name: distributed_fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_griffin_generate_config.yaml b/examples/nlp/language_modeling/conf/megatron_griffin_generate_config.yaml new file mode 100644 index 000000000000..4b3c14c846d1 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_griffin_generate_config.yaml @@ -0,0 +1,292 @@ +name: megatron_griffin +restore_from_path: ${model.restore_from_path} # used when starting from a .nemo file + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + limit_val_batches: 1024 + limit_test_batches: 500 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: + project: griffin + name: sft-test + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + +model: + restore_from_path: null + # model parallelism + micro_batch_size: 2 + global_batch_size: 2 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + vocab_size: 256000 + apply_rope_fusion: True + # model architecture + encoder_seq_length: 512 + logits_soft_cap: 30.0 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 26 + gated_linear_unit: True + window_size: [1024, 0] + num_query_groups: 1 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 2560 + bias_activation_fusion: True + ffn_hidden_size: 7680 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 10 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. 
+ normalization: RMSNorm + layernorm_epsilon: 1e-6 + rotary_interleaved: False + layernorm_zero_centered_gamma: True + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + activation: 'fast-geglu' + + answer_only_loss: True + + + tokenizer: + library: 'huggingface' + type: 'google/recurrentgemma-2b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: False + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. 
+ # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + peft: + peft_scheme: "lora" # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + test_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: ??? # Names of the corresponding datasets used to log metrics. + global_batch_size: 1 + micro_batch_size: 1 + shuffle: False + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. 
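+      # To dump generations, set for example (hypothetical path):
+      #   write_predictions_to_file: True
+      #   output_file_path_prefix: /results/griffin_test_preds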
+ truncation_field: "input" # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + outfile_path: output.txt + compute_attention_mask: True + +# server-related configs +server: False # whether launch the API server +port: 5555 # the port number for the inference server +web_server: False # whether launch the web inference server +share: True # whether create a public URL +username: test # user name for web client +password: test2 # password for web client +web_port: 9889 # the port number of the web server 1058 +chat: False # use the chat interface +chatbot_config: + value: False # whether to inject the value attributes + attributes: + - name: Quality + min: 0 + max: 4 + key: quality + type: int + default: 4 + - name: Toxicity + min: 0 + max: 4 + key: toxcity + type: int + default: 0 + - name: Humor + min: 0 + max: 4 + key: humor + type: int + default: 0 + - name: Creativity + min: 0 + max: 4 + key: creativity + type: int + default: 0 + - name: Violence + min: 0 + max: 4 + key: violence + type: int + default: 0 + - name: Helpfulness + min: 0 + max: 4 + key: helpfulness + type: int + default: 4 + - name: Not_Appropriate + min: 0 + max: 4 + key: not_appropriate + type: int + default: 0 + - name: Language + choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] + key: lang + type: list + default: en + + user: User + assistant: Assistant + system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" \ No newline at end of file diff --git a/examples/nlp/language_modeling/megatron_griffin_finetuning.py b/examples/nlp/language_modeling/megatron_griffin_finetuning.py new file mode 100644 index 000000000000..c5ae513d5874 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_griffin_finetuning.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf + +from nemo.collections.nlp.models.language_modeling.megatron_griffin_sft_model import MegatronGriffinSFTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + +mp.set_start_method("spawn", force=True) + + +@hydra_runner(config_path="conf", config_name="megatron_griffin_finetuning_config") +def main(cfg) -> None: + + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + precision = cfg.trainer.precision + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + # Restore the precision value after Trainer is built. + cfg.trainer.precision = precision + exp_manager(trainer, cfg.exp_manager) + + model_cfg = MegatronGriffinSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) + model = MegatronGriffinSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + + if cfg.model.peft.restore_from_path is not None: + # initialize peft weights from a checkpoint instead of randomly + # This is not the same as resume training because optimizer states are not restored. + logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) + model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) + elif peft_cfg_cls is not None: + logging.info("Adding adapter weights to the model for PEFT") + model.add_adapter(peft_cfg_cls(model_cfg)) + else: + logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/language_modeling/megatron_griffin_generate.py b/examples/nlp/language_modeling/megatron_griffin_generate.py new file mode 100644 index 000000000000..c8e36668fced --- /dev/null +++ b/examples/nlp/language_modeling/megatron_griffin_generate.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
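+
+# Example invocation (all paths below are placeholders, not shipped defaults):
+#   python examples/nlp/language_modeling/megatron_griffin_generate.py \
+#       model.restore_from_path=/path/to/megatron_griffin.nemo \
+#       model.peft.restore_from_path=/path/to/lora_adapter.nemo \
+#       model.data.test_ds.file_names=[/path/to/test.jsonl] \
+#       model.data.test_ds.names=[my_test_set]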
+ + +import os +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf +from nemo.collections.nlp.models.language_modeling.megatron_griffin_sft_model import MegatronGriffinSFTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.model_utils import inject_model_parallel_rank + + +mp.set_start_method("spawn", force=True) + + +@hydra_runner(config_path="conf", config_name="megatron_griffin_generate_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + + if cfg.model.peft.restore_from_path: + model_cfg = MegatronGriffinSFTModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) + else: + model_cfg = MegatronGriffinSFTModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) + + model = MegatronGriffinSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + if cfg.model.peft.restore_from_path: + model.load_adapters(cfg.model.peft.restore_from_path) + elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + checkpoint_path = os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank( + os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + ) + model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg)) + else: + raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") + + model.freeze() + logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") + + trainer.test(model) + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 501c766374e1..6354387c18e7 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math import re from typing import List, Mapping, Optional @@ -60,6 +61,7 @@ def __init__( special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} is_test: bool = False, output_original_text: bool = False, + ceil_to_power_2: bool = False, ): """ file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} @@ -109,6 +111,8 @@ def __init__( self.truncation_method = truncation_method self.is_test = is_test self.output_original_text = output_original_text + self.ceil_to_power_2 = ceil_to_power_2 + if special_tokens is None: self.special_tokens = { "system_turn_start": "", @@ -406,7 +410,11 @@ def _maybe_cast_to_list(self, x): return x def _ceil_to_nearest(self, n, m): - return (n + m - 1) // m * m + if self.ceil_to_power_2: + # Reccurent Gemma (AKA Griffin) requires seq length to be a power of 2 for parallel scan + return 2 ** math.ceil(math.log2(n)) + else: + return (n + m - 1) // m * m def _collate_item(self, item, max_length, pad_id): item = self._maybe_cast_to_list(item) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/__init__.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/__init__.py new file mode 100755 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_block.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_block.py new file mode 100755 index 000000000000..3fc26a51f3c1 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_block.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
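+
+# Builds the Griffin decoder stack. Layers follow a repeating pattern of two recurrent
+# (RG-LRU) blocks followed by one multi-query attention layer (see get_griffin_layers:
+# every index with i % 3 == 2 uses the attention spec), with a final TENorm on the output.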
+ +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.transformer.custom_layers.transformer_engine import TENorm +from megatron.core.transformer.spec_utils import build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from torch import nn + +from nemo.collections.nlp.models.language_modeling.megatron.griffin.griffin_layer_spec import ( + griffin_mqa_layer_with_transformer_engine_spec, + griffin_recurrent_layer_with_transformer_engine_spec, +) + + +def get_griffin_layers(num_layers): + dict_spec = { + "Recurrent_Layer": griffin_recurrent_layer_with_transformer_engine_spec, + "Attention_Layer": griffin_mqa_layer_with_transformer_engine_spec, + } + + griffin_layers = [] + for i in range(num_layers): + if i % 3 == 2: + griffin_layers.append(dict_spec["Attention_Layer"]) + else: + griffin_layers.append(dict_spec["Recurrent_Layer"]) + + return griffin_layers + + +def create_block( + config, layer_spec, layer_idx, +): + block = build_module(layer_spec, config,) + block.layer_number = layer_idx + 1 + return block + + +class GriffinStack(LanguageModule): + def __init__( + self, config: TransformerConfig, + ): + + super().__init__(config) + self.config = config + self.griffin_layers = get_griffin_layers(self.config.num_layers) + + self.layers = nn.ModuleList( + [create_block(self.config, layer_spec, layer_idx=i,) for i, layer_spec in enumerate(self.griffin_layers)] + ) + self.final_layernorm = TENorm( + config=self.config, hidden_size=self.config.hidden_size, eps=self.config.layernorm_epsilon, + ) + + def forward(self, hidden_states, attention_mask, rotary_pos_emb): + + for layer in self.layers: + + hidden_states, _ = layer(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb) + + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_layer_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_layer_spec.py new file mode 100755 index 000000000000..a504898e9d64 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_layer_spec.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
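+
+# ModuleSpecs for the two Griffin layer types consumed by griffin_block.py: a causal
+# multi-query-attention TransformerLayer and a RecurrentBlock (Conv1D + RG-LRU), both
+# wired to Transformer Engine linear and norm modules.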
+ +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +from nemo.collections.nlp.models.language_modeling.megatron.griffin.recurrent_layer import ( + RecurrentBlock, + RecurrentBlockSubmodules, +) +from nemo.collections.nlp.models.language_modeling.megatron.griffin.recurrent_module import ( + RGLRU, + Conv1D, + RecurrentLayer, + RecurrentLayerSubmodules, +) + +griffin_mqa_layer_with_transformer_engine_spec = ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules(linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, + ), +) + +griffin_recurrent_layer_with_transformer_engine_spec = ModuleSpec( + module=RecurrentBlock, + submodules=RecurrentBlockSubmodules( + recurrent_layer=ModuleSpec( + module=RecurrentLayer, + submodules=RecurrentLayerSubmodules( + linear_in=TELayerNormColumnParallelLinear, + linear_out=TERowParallelLinear, + conv_1d=Conv1D, + rg_lru=RGLRU, + ), + ), + recurrent_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules(linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, + ), +) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_model.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_model.py new file mode 100755 index 000000000000..9f00fb9dd156 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/griffin_model.py @@ -0,0 +1,156 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
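+
+# GriffinModel ties input and output embeddings, scales token embeddings by
+# sqrt(hidden_size), optionally applies RoPE, runs the GriffinStack decoder, and
+# soft-caps the output logits with tanh(logits / logits_soft_cap) * logits_soft_cap.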
+ +import math + +import torch +from megatron.core.jit import jit_fuser +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.transformer.transformer_config import TransformerConfig +from torch import Tensor, nn + +from nemo.collections.nlp.models.language_modeling.megatron.griffin.griffin_block import GriffinStack + + +class GriffinModel(LanguageModule): + def __init__( + self, + config: TransformerConfig, + vocab_size: int = 256000, + logits_soft_cap: float = 30.0, + position_embedding_type: str = 'rope', + max_sequence_length: int = 1024, + rotary_percent: float = 0.5, + rotary_base: int = 10000, + pre_process=True, + ): + + super().__init__(config) + self.config = config + self.vocab_size = vocab_size + self.logits_soft_cap = logits_soft_cap + self.position_embedding_type = position_embedding_type + self.pre_process = pre_process + self.post_process = False + self.share_embeddings_and_output_weights = True + + if pre_process: + self.embedding = LanguageModelEmbedding( + config, + vocab_size=self.vocab_size, + max_sequence_length=max_sequence_length, + position_embedding_type=None, + ) + + if self.position_embedding_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=config.kv_channels, + rotary_percent=rotary_percent, + rotary_interleaved=config.rotary_interleaved, + seq_len_interpolation_factor=None, + rotary_base=rotary_base, + ) + + self.decoder = GriffinStack(self.config) + + def shared_embedding_or_output_weight(self) -> Tensor: + """Gets the emedding weight or output logit weights when share embedding and output weights set to True. + + Returns: + Tensor: During pre processing it returns the input embeddings weight while during post processing it returns the final output layers weight + """ + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return { + i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + for i, layer in enumerate(self.layers) + } + + def set_input_tensor(self, input_tensor: Tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. 
This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def griffin_position_ids(self, token_ids): + # Create position ids + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids + + def embedding_forward(self, input_ids): + + position_ids = self.griffin_position_ids(input_ids) + embeddings = self.embedding(input_ids, position_ids) + embeddings = embeddings * torch.tensor(math.sqrt(self.config.hidden_size)).type_as(embeddings) + + return embeddings + + @jit_fuser + def _embedding_decode_(self, logits, transpose): + logits = nn.functional.tanh(logits / self.logits_soft_cap) * self.logits_soft_cap + if transpose: + logits = logits.transpose(0, 1) + return logits.contiguous() + + def embedding_decode(self, x, transpose): + x = x.permute(1, 0, 2) + logits = x @ self.embedding.word_embeddings.state_dict()['weight'].T + logits = self._embedding_decode_(logits, transpose) + + return logits + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor = None, + attention_mask: Tensor = None, + labels: Tensor = None, + **extra_arg + ): + if input_ids is None: + input_ids = self.input_tensor + + hidden_states = self.embedding_forward(input_ids) + + rotary_pos_emb = None + self.decoder.input_tensor = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(None, self.decoder, hidden_states, self.config) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + hidden_states = self.decoder(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb) + + logits = self.embedding_decode(hidden_states, labels is not None) + + if labels is None: + # [b s h] + return logits + + loss = self.compute_language_model_loss(labels, logits) + + return loss diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/recurrent_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/recurrent_layer.py new file mode 100755 index 000000000000..8263f54889a0 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/recurrent_layer.py @@ -0,0 +1,106 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
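+
+# RecurrentBlock is the residual block used for Griffin's non-attention layers. It mirrors
+# the TransformerLayer layout (norm -> mixer -> bias-dropout-add -> norm -> MLP ->
+# bias-dropout-add) but replaces self-attention with the recurrent (Conv1D + RG-LRU) layer.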
+ +from dataclasses import dataclass +from typing import Union + +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor +from torch import Tensor + + +@dataclass +class RecurrentBlockSubmodules: + input_layernorm: Union[ModuleSpec, type] = IdentityOp + recurrent_layer: Union[ModuleSpec, type] = IdentityOp + recurrent_bda: Union[ModuleSpec, type] = IdentityFuncOp + + pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp + mlp: Union[ModuleSpec, type] = IdentityOp + mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp + + +class RecurrentBlock(MegatronModule): + def __init__( + self, + config: TransformerConfig, + submodules: RecurrentBlockSubmodules, + layer_idx=None, + residual_in_fp32=False, + **kwargs, + ): + """ + Top level Mamba Layer + """ + super().__init__(config) + self.config = config + self.residual_in_fp32 = residual_in_fp32 + self.hidden_dropout = config.hidden_dropout + + self.input_layernorm = build_module(submodules.input_layernorm, dim=self.config.hidden_size) + + self.recurrent_layer = build_module( + submodules.recurrent_layer, + self.config, + width=self.config.hidden_size, + num_heads=self.config.num_attention_heads, + lru_width=self.config.hidden_size, + conv1d_temporal_width=4, + final_w_init_variance_scale=1.0, + ) + + self.recurrent_bda = build_module(submodules.recurrent_bda) + + self.pre_mlp_layernorm = build_module(submodules.pre_mlp_layernorm, dim=self.config.hidden_size) + + self.mlp = build_module(submodules.mlp, config=self.config) + + self.mlp_bda = build_module(submodules.mlp_bda) + + def forward(self, hidden_states: Tensor, attention_mask: Tensor, inference_params=None, **kwargs): + + residual = hidden_states + + # Optional Input Layer norm + input_layernorm_output = self.input_layernorm(hidden_states) + + # Reccurent block. + recurrent_output_with_bias = self.recurrent_layer(input_layernorm_output) + + hidden_states = self.recurrent_bda(self.training, self.config.bias_dropout_fusion)( + recurrent_output_with_bias, residual, self.hidden_dropout + ) + + # Residual connection. + residual = hidden_states + + # Optional Layer norm post the cross-attention. + pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) + + # MLP. + mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output) + + hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)( + mlp_output_with_bias, residual, self.hidden_dropout + ) + + output = make_viewless_tensor(inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True) + + return output, None + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/griffin/recurrent_module.py b/nemo/collections/nlp/models/language_modeling/megatron/griffin/recurrent_module.py new file mode 100755 index 000000000000..6cd9eeaadc63 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/griffin/recurrent_module.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Union + +import einops +import torch +from accelerated_scan.ref import scan +from causal_conv1d import causal_conv1d_fn +from einops import rearrange +from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from torch import nn + + +# Class copied from https://github.com/google-deepmind/recurrentgemma +class BlockDiagonalLinear(nn.Module): + """Block-diagonal linear layer.""" + + def __init__( + self, width: int, num_blocks: int, w_init_variance_scale: float = 1.0, + ): + """Initializes the BlockDiagonalLinear. + + Args: + width: The number of dimensions of the input and output. + num_blocks: The number of diagonal blocks in the layer. + w_init_variance_scale: A parameters that scales the variance of the + initialization of the weights. + """ + super().__init__() + self.width = width + self.num_blocks = num_blocks + self.w_init_variance_scale = w_init_variance_scale + self.block_width = self.width // self.num_blocks + + # Parameters. + self.w = nn.Parameter(torch.zeros([self.num_blocks, self.block_width, self.block_width])) + self.b = nn.Parameter(torch.zeros([self.num_blocks, self.block_width])) + + # Initialization. + self.w_init_(self.w) + + def w_init_(self, w: torch.Tensor) -> None: + """Initializes the weight `w` of the layer.""" + std = math.sqrt(self.w_init_variance_scale / self.block_width) + torch.nn.init.normal_(w, mean=0.0, std=std) + + def forward(self, x): + """Calls the BlockDiagonalLinear.""" + # Split x to blocks. + x = einops.rearrange(x, "... (h i) -> ... h i", h=self.num_blocks) + + # Linear layer over each block + bias. + y = torch.einsum("... h i, h i j -> ... h j", x, self.w) + self.b + + # Flatten the output. + return einops.rearrange(y, "... h j -> ... (h j)", h=self.num_blocks) + + +# Class copied from https://github.com/google-deepmind/recurrentgemma + + +def rnn_scan( + x, a, reset, h0, +): + """Runs the recurrence of a linear RNN. + + Args: + x: The input sequence. + a: The diagonal of the recurrence matrix `A`. + reset: Indicator of document boundaries, e.g. when to reset the hidden + state of the RNN. + h0: The initial hidden state. + + Returns: + The output of the linear recurrence. + """ + + assert x.ndim == 3 + assert a.shape == x.shape[-a.ndim :] + assert a.dtype == x.dtype + assert type(a) is type(x) + + # Multiply `a` by the reset. + a = a * (1 - reset)[..., None] + + if x.shape[1] == 1: + # Using scan in sampling mode. + y = a * h0[:, None] + x + else: + # Using scan in linear mode. 
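+        # The scan kernel runs over the last axis, so move the sequence dimension from
+        # (batch, seq, width) to (batch, width, seq), scan in fp32, and transpose back below.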
+ x = x.permute(0, 2, 1) + a = a.permute(0, 2, 1) + x = x.contiguous() + a = a.contiguous() + y = scan(a.float(), x.float()).type_as(x) + y = y.permute(0, 2, 1) + return y, None + + +# Class copied from https://github.com/google-deepmind/recurrentgemma + + +def rnn_param_init(*, width: int, min_rad: float, max_rad: float, transform: str = "softplus",) -> torch.Tensor: + """Initializes the `A` parameter of the RG-LRU uniformly on a ring.""" + unif = torch.rand(width) + # Proportional to area in a ring. + a_real = 0.5 * torch.log(unif * (max_rad ** 2 - min_rad ** 2) + min_rad ** 2 + 1e-8) + + if transform == "softplus": + # Inverse transform. + return torch.log(torch.exp(-a_real) - 1.0) + else: + raise NotImplementedError() + + +# Class copied from https://github.com/google-deepmind/recurrentgemma + + +class RGLRU(nn.Module): + """A Real-Gated Linear Recurrent Unit (RG-LRU) layer.""" + + def __init__( + self, width: int, num_heads: int, w_init_variance_scale: float = 1.0, + ): + """Initializes the RG-LRU. + + Args: + width: The number of dimensions of the input and output. + num_heads: The number of diagonal blocks in the input and A gate layers. + w_init_variance_scale: Initialization parameter for the + BlockDiagonalLinear layers of the gates. See the `BlockDiagonalLinear` + layer for details. + """ + super().__init__() + self.width = width + self.num_heads = num_heads + self.w_init_variance_scale = w_init_variance_scale + + # Parameters and layers. + self.a_param = nn.Parameter(self.a_param_init) + self.input_gate = BlockDiagonalLinear( + width=self.width, num_blocks=self.num_heads, w_init_variance_scale=w_init_variance_scale, + ) + self.a_gate = BlockDiagonalLinear( + width=self.width, num_blocks=self.num_heads, w_init_variance_scale=self.w_init_variance_scale + ) + + @property + def a_param_init(self) -> torch.Tensor: + """Initializes the `A` parameter of the RG-LRU.""" + return rnn_param_init(width=self.width, min_rad=0.9, max_rad=0.999) + + def __call__( + self, x, segment_pos, prev_h, + ): + """Calls the RG-LRU. + + Args: + x: Sequence of input activations. + segment_pos: Position of each token in the sequence. + prev_h: The previous hidden state of the RG-LRU. + + Returns: + Output of the block together with the updated hidden state. + """ + for param in self.parameters(): + param.data_ptr() + + bs, l, d = x.shape + assert segment_pos.shape == (bs, l) + reset = (segment_pos == 0).type(torch.int32) + prev_h = torch.zeros(size=(bs, d)) if prev_h is None else prev_h + prev_h = prev_h.cuda() + # Gates for x and a. + gate_x = torch.sigmoid(self.input_gate(x)) + gate_a = torch.sigmoid(self.a_gate(x)) + + # Compute the parameter `A` of the recurrence. + log_a = -8.0 * gate_a * nn.functional.softplus(self.a_param) + a = torch.exp(log_a) + + # Gate the input. + gated_x = x * gate_x + + # Apply gamma normalization to the input. 
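+        # sqrt(1 - a^2) scaling keeps the recurrent state at roughly constant variance;
+        # at sequence-reset positions (reset == 1) the multiplier below is forced to 1.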
+ multiplier = torch.sqrt((1 - torch.exp(2 * log_a)) + 1e-6) + multiplier = reset[..., None] + (1 - reset)[..., None] * multiplier + normalized_x = gated_x * multiplier.type(x.dtype) + + y, last_h = rnn_scan(x=normalized_x, a=a, reset=reset, h0=prev_h,) + + return y, last_h + + +class Conv1D(MegatronModule): + def __init__(self, config, width, temporal_width): + super().__init__(config=config) + self.config = config + self.width = width + self.temporal_width = temporal_width + self.conv_1d = nn.Conv1d( + in_channels=width, + out_channels=width, + bias=True, + kernel_size=temporal_width, + groups=width, + padding=temporal_width - 1, + ) + + def forward( + self, x, segment_pos=None, prev_x=None, + ): + x = x.permute(0, 2, 1) + output = causal_conv1d_fn( + x=x, weight=rearrange(self.conv_1d.weight, "d 1 w -> d w"), bias=self.conv_1d.bias, activation=None, + ).permute(0, 2, 1) + return output, None + + +@dataclass +class RecurrentLayerSubmodules: + linear_in: Union[ModuleSpec, type] = IdentityOp + linear_out: Union[ModuleSpec, type] = IdentityOp + conv_1d: Union[ModuleSpec, type] = IdentityOp + rg_lru: Union[ModuleSpec, type] = IdentityOp + + +def gelu(x: torch.Tensor) -> torch.Tensor: + """Returns the GELU activation function with the same approximation as JAX.""" + return nn.functional.gelu(x, approximate="tanh") + + +class RecurrentLayer(MegatronModule): + def __init__( + self, + config: TransformerConfig, + submodules: RecurrentLayerSubmodules, + layer_idx=None, + residual_in_fp32=False, + **kwargs, + ): + """ + Top level Mamba Layer + """ + super().__init__(config) + self.config = config + self.residual_in_fp32 = residual_in_fp32 + + self.linear_in = build_module( + submodules.linear_in, + self.config.hidden_size, + self.config.hidden_size * 2, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear, + skip_bias_add=True, + is_expert=False, + ) + + self.linear_out = build_module( + submodules.linear_out, + self.config.hidden_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.init_method, + bias=self.config.add_bias_linear, + skip_bias_add=True, + is_expert=False, + input_is_parallel=True, + ) + + self.conv_1d = build_module( + submodules.conv_1d, config=self.config, width=self.config.hidden_size, temporal_width=4 + ) + + self.rg_lru = build_module( + submodules.rg_lru, width=self.config.hidden_size, num_heads=self.config.num_attention_heads + ) + + def forward(self, hidden_states, attention_mask=None, rotary_pos_emb=None): + + segment_pos = torch.arange(hidden_states.shape[0]).unsqueeze(0).repeat(hidden_states.shape[1], 1).cuda() + in_intermidiate_parallel, in_bias_parallel = self.linear_in(hidden_states) + + x_bias_parallel, y_bias_parallel = in_bias_parallel.chunk(2, dim=-1) + x_intermidiate_parallel, y_intermidiate_parallel = in_intermidiate_parallel.chunk(2, dim=-1) + + y = bias_gelu_impl(y_intermidiate_parallel, y_bias_parallel) + + x = x_intermidiate_parallel + x_bias_parallel + x = x.permute(1, 0, 2) + + x, _ = self.conv_1d(x=x, segment_pos=segment_pos, prev_x=None) + + x, _ = self.rg_lru(x=x, segment_pos=segment_pos, prev_h=None,) + + x = x.permute(1, 0, 2) + + x = x * y + x_intermidiate_parallel, x_bias_parallel = self.linear_out(x) + + return x_intermidiate_parallel, x_bias_parallel diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 892a87189880..32b22df22d2c 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -290,6 +290,7 @@ def _build_dataset(self, data_cfg, is_train=True): pad_to_max_length=data_cfg.get('pad_to_max_length', False), index_mapping_dir=data_cfg.get('index_mapping_dir', None), prompt_template=data_cfg.get('prompt_template', None), + ceil_to_power_2=data_cfg.get('ceil_to_power_2', False), virtual_tokens=self.virtual_tokens, tokens_to_generate=data_cfg.get( 'tokens_to_generate', 0 diff --git a/nemo/collections/nlp/models/language_modeling/megatron_griffin_model.py b/nemo/collections/nlp/models/language_modeling/megatron_griffin_model.py new file mode 100644 index 000000000000..20ad376b8f98 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_griffin_model.py @@ -0,0 +1,96 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron.griffin.griffin_model import GriffinModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults + +try: + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + TransformerConfig = ApexGuardDefaults + HAVE_MEGATRON_CORE = False + + +class MegatronGriffinModel(MegatronGPTModel): + """ + Megatron Griffin pretraining. + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + if not HAVE_MEGATRON_CORE: + raise ImportError( + "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." 
+ ) + + # build the transformer config + # TODO: add type hint once pip package is out + + self.vocab_size = cfg.get('vocab_size', 256000) + self.cfg = cfg + super().__init__(cfg=cfg, trainer=trainer) + self.mcore_gpt = True + + def model_provider_func(self, pre_process, post_process): + model = GriffinModel( + config=self.transformer_config, + max_sequence_length=self.cfg.get('encoder_seq_length', 512), + vocab_size=self.cfg.get('vocab_size', 256000), + position_embedding_type=self.cfg.get('position_embedding_type', 'rope'), + logits_soft_cap=self.cfg.get('logits_soft_cap', 30.0), + rotary_percent=self.cfg.get('rotary_percentage', 0.5), + rotary_base=self.cfg.get('rotary_base', 10000), + ) + + return model + + def forward(self, input_ids, position_ids=None, attention_mask=None, labels=None): + + output_tensor = self.model( + input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask, labels=labels + ) + return output_tensor + + def build_transformer_config(self): + transformer_config = super().build_transformer_config() + transformer_config.gated_linear_unit = self.cfg.get('gated_linear_unit', True) + transformer_config.layernorm_zero_centered_gamma = self.cfg.get('layernorm_zero_centered_gamma', True) + + return transformer_config + + def on_validation_epoch_end(self): + + averaged_loss = torch.tensor(0.0, dtype=torch.float32).cuda() + return averaged_loss + + def sharded_state_dict(self, prefix: str = ''): + return None + + def _reset_activation_checkpointing_args(self): + return + + def _restore_activation_checkpointing_args(self): + return + + def _reset_sequence_parallelism_args(self): + return + + def _restore_sequence_parallelism_args(self): + return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_griffin_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_griffin_sft_model.py new file mode 100644 index 000000000000..c53d231b2719 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_griffin_sft_model.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from omegaconf import DictConfig +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel +from nemo.collections.nlp.models.language_modeling.megatron_griffin_model import MegatronGriffinModel + +try: + HAVE_APEX = True +except (ImportError, ModuleNotFoundError): + HAVE_APEX = False + +__all__ = ['MegatronGriffinSFTModel'] + + +class MegatronGriffinSFTModel(MegatronGPTSFTModel, MegatronGriffinModel): + """ + Megatron Griffin Supervised Fine-Tuning + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + if not HAVE_APEX: + raise ImportError( + "Apex was not found. 
Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + ) + + super().__init__(cfg, trainer=trainer) + self.mcore_gpt = True + self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) + + def _reset_activation_checkpointing_args(self): + pass + + def on_validation_model_zero_grad(self) -> None: + """ + Skip gradient zeroing at the beginning of validation routine. + This is needed when overlapping the AllGather of the updated parameters with the following valdation step. + """ + if not self.validation_param_sync_overlap: + MegatronBaseModel.on_validation_model_zero_grad(self) diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index e29bb3423c4a..c6e96e94e6ff 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -330,6 +330,78 @@ def prepare_batch_at_step( return batch, tensor_shape +class GriffinModelTextGenerationStrategy(TextGenerationStrategy): + def __init__(self, model): + super().__init__(model) + self.forward_model = self.model.model + + def clip_max_len(self, maxlen: int) -> int: + """ clip the max len based on the LM model max sequence length""" + + # for positional embedding types that allow length extrapolation, don't clip the max length + if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": + if maxlen > self.model.cfg.encoder_seq_length + 1: + maxlen = self.model.cfg.encoder_seq_length + 1 + return maxlen + + def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_attention_mask: bool): + """initialize the batch data before the inference steps.""" + # Move to GPU. + tokenizer = self.model.tokenizer + tokens = context_tokens.contiguous().cuda() + # Get the attention mask and postition ids. + self.attention_mask, _, self.position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eos_id, + self.model.cfg.get('reset_position_ids', False), + self.model.cfg.get('reset_attention_mask', False), + self.model.cfg.get('eod_mask_loss', False), + compute_attention_mask=compute_attention_mask, + ) + self.attention_mask = None + + def prepare_batch_at_step( + self, + tokens: torch.Tensor, + maxlen: int, + micro_batch_size: int, + step: int, + context_length: int, + compute_attention_mask: bool = False, + ) -> Tuple[List[torch.Tensor], List[int]]: + """ + generate the batch used in inference for each of the steps + """ + # types2use = None + # Allocate memory for the entire context. 
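+        # The recurrent Griffin model re-consumes the full token buffer at every step and
+        # needs no attention mask, so the batch below carries only the tokens and a None mask.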
+ + tokens2use = tokens + + """Prepare batch for each of the inference steps""" + attention_mask_repeat = None + + batch = [tokens2use, attention_mask_repeat] + tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] + return batch, (tensor_shape, context_length) + + def forward_step(self, batch, tensor_shape_and_context_length): + tensor_shape, context_length = tensor_shape_and_context_length + fwd_bwd_function = get_forward_backward_func() + + output_tensor = fwd_bwd_function( + forward_step_func=self.model.get_forward_output_only_func(), + data_iterator=iter([batch,]), + model=[self.forward_model], + num_microbatches=get_num_microbatches(), + forward_only=True, + seq_length=tensor_shape[0], + micro_batch_size=tensor_shape[1], + ) + + output_tensor[0]['logits'] = output_tensor[0]['logits'][:, :context_length, :] + return output_tensor + + def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, conv_template): from nemo.collections.multimodal.data.neva.neva_dataset import ( DEFAULT_IMAGE_TOKEN, @@ -821,6 +893,7 @@ def model_inference_strategy_dispatcher(model, **args): from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( MegatronGPTPromptLearningModel, ) + from nemo.collections.nlp.models.language_modeling.megatron_griffin_model import MegatronGriffinModel from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel from nemo.collections.nlp.modules.common.retro_inference_strategies import ( @@ -829,6 +902,8 @@ def model_inference_strategy_dispatcher(model, **args): RetroQAModelTextGenerationStrategy, ) + if isinstance(model, MegatronGriffinModel): + return GriffinModelTextGenerationStrategy(model) if isinstance(model, MegatronNevaModel): return NevaModelTextGenerationStrategy(model) if isinstance(model, MegatronGPTPromptLearningModel): diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 46e82089f0ea..9fd75ad8a95a 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -1,4 +1,6 @@ +accelerated-scan boto3 +causal-conv1d>=1.2.0 einops faiss-cpu fasttext diff --git a/scripts/checkpoint_converters/convert_griffin_hf_to_nemo.py b/scripts/checkpoint_converters/convert_griffin_hf_to_nemo.py new file mode 100644 index 000000000000..44435cc21135 --- /dev/null +++ b/scripts/checkpoint_converters/convert_griffin_hf_to_nemo.py @@ -0,0 +1,174 @@ +import os +from argparse import ArgumentParser + +import torch +from omegaconf.omegaconf import OmegaConf +from transformers import AutoModelForCausalLM + +from nemo.collections.nlp.models.language_modeling.megatron_griffin_model import MegatronGriffinModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--hparams_file", + type=str, + default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_griffin_config.yaml", + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument("--input_name_or_path", type=str, default="google/recurrentgemma-2b") + parser.add_argument( + "--precision", type=str, default="32", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + ) + args = parser.parse_args() + return args + + +def convert(args): + + nemo_config = OmegaConf.load(args.hparams_file) + nemo_config.trainer["precision"] = args.precision + + logging.info(f"Loading checkpoint from HF: `{args.input_name_or_path}`") + hf_model = AutoModelForCausalLM.from_pretrained(args.input_name_or_path, device_map="auto") + + trainer = MegatronLMPPTrainerBuilder(nemo_config).create_trainer() + + nemo_model_from_hf = MegatronGriffinModel(nemo_config.model, trainer) + + new_state_dict = {} + + new_state_dict['model.embedding.word_embeddings.weight'] = hf_model.state_dict()['model.embed_tokens.weight'] + new_state_dict['model.decoder.final_layernorm.weight'] = hf_model.state_dict()['model.final_norm.weight'] + + for l in range(nemo_config.model.num_layers): + print(f"Converting Layer {l}") + print("********************") + + new_state_dict[f'model.decoder.layers.{l}.mlp.linear_fc1.weight'] = torch.cat( + [ + hf_model.state_dict()[f'model.layers.{l}.mlp_block.gate_proj.weight'], + hf_model.state_dict()[f'model.layers.{l}.mlp_block.up_proj.weight'], + ] + ) + new_state_dict[f'model.decoder.layers.{l}.mlp.linear_fc1.bias'] = torch.cat( + [ + hf_model.state_dict()[f'model.layers.{l}.mlp_block.gate_proj.bias'], + hf_model.state_dict()[f'model.layers.{l}.mlp_block.up_proj.bias'], + ] + ).flatten() + new_state_dict[f'model.decoder.layers.{l}.mlp.linear_fc2.weight'] = hf_model.state_dict()[ + f'model.layers.{l}.mlp_block.down_proj.weight' + ] + new_state_dict[f'model.decoder.layers.{l}.mlp.linear_fc2.bias'] = hf_model.state_dict()[ + f'model.layers.{l}.mlp_block.down_proj.bias' + ] + new_state_dict[f'model.decoder.layers.{l}.mlp.linear_fc1._extra_state'] = nemo_model_from_hf.state_dict()[ + f'model.decoder.layers.{l}.mlp.linear_fc1._extra_state' + ] + new_state_dict[f'model.decoder.layers.{l}.mlp.linear_fc2._extra_state'] = nemo_model_from_hf.state_dict()[ + f'model.decoder.layers.{l}.mlp.linear_fc2._extra_state' + ] + + new_state_dict[f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight'] = hf_model.state_dict()[ + f'model.layers.{l}.channel_pre_norm.weight' + ] + + if l % 3 == 2: + new_state_dict[f'model.decoder.layers.{l}.self_attention.linear_proj.weight'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.o_proj.weight' + ] + new_state_dict[f'model.decoder.layers.{l}.self_attention.linear_proj.bias'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.o_proj.bias' + ] + new_state_dict[ + f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight' + ] = hf_model.state_dict()[f'model.layers.{l}.temporal_pre_norm.weight'] + new_state_dict[f'model.decoder.layers.{l}.self_attention.linear_qkv.weight'] = torch.cat( + [ + hf_model.state_dict()[f'model.layers.{l}.temporal_block.q_proj.weight'], + hf_model.state_dict()[f'model.layers.{l}.temporal_block.k_proj.weight'], + hf_model.state_dict()[f'model.layers.{l}.temporal_block.v_proj.weight'], + ] + ) + new_state_dict[f'model.decoder.layers.{l}.self_attention.linear_qkv.bias'] = torch.zeros( + new_state_dict[f'model.decoder.layers.{l}.self_attention.linear_qkv.weight'].shape[0] + ) + new_state_dict[ + 
f'model.decoder.layers.{l}.self_attention.linear_proj._extra_state' + ] = nemo_model_from_hf.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_proj._extra_state'] + new_state_dict[ + f'model.decoder.layers.{l}.self_attention.linear_qkv._extra_state' + ] = nemo_model_from_hf.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_qkv._extra_state'] + + else: + + new_state_dict[ + f'model.decoder.layers.{l}.recurrent_layer.linear_in.layer_norm_weight' + ] = hf_model.state_dict()[f'model.layers.{l}.temporal_pre_norm.weight'] + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.linear_in.weight'] = torch.cat( + [ + hf_model.state_dict()[f'model.layers.{l}.temporal_block.linear_x.weight'], + hf_model.state_dict()[f'model.layers.{l}.temporal_block.linear_y.weight'], + ] + ) + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.linear_in.bias'] = torch.cat( + [ + hf_model.state_dict()[f'model.layers.{l}.temporal_block.linear_x.bias'], + hf_model.state_dict()[f'model.layers.{l}.temporal_block.linear_y.bias'], + ] + ) + + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.linear_out.weight'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.linear_out.weight' + ] + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.linear_out.bias'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.linear_out.bias' + ] + + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.conv_1d.conv_1d.weight'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.conv_1d.weight' + ] + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.conv_1d.conv_1d.bias'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.conv_1d.bias' + ] + + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.rg_lru.a_param'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.rg_lru.recurrent_param' + ] + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.rg_lru.input_gate.w'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.rg_lru.input_gate_weight' + ] + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.rg_lru.input_gate.b'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.rg_lru.input_gate_bias' + ] + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.rg_lru.a_gate.w'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.rg_lru.recurrent_gate_weight' + ] + new_state_dict[f'model.decoder.layers.{l}.recurrent_layer.rg_lru.a_gate.b'] = hf_model.state_dict()[ + f'model.layers.{l}.temporal_block.rg_lru.recurrent_gate_bias' + ] + + new_state_dict[ + f'model.decoder.layers.{l}.recurrent_layer.linear_in._extra_state' + ] = nemo_model_from_hf.state_dict()[f'model.decoder.layers.{l}.recurrent_layer.linear_in._extra_state'] + new_state_dict[ + f'model.decoder.layers.{l}.recurrent_layer.linear_out._extra_state' + ] = nemo_model_from_hf.state_dict()[f'model.decoder.layers.{l}.recurrent_layer.linear_out._extra_state'] + + nemo_model_from_hf.load_state_dict(new_state_dict, strict=True) + dtype = torch_dtype_from_precision(args.precision) + nemo_model_from_hf = nemo_model_from_hf.to(dtype=dtype) + + nemo_model_from_hf.save_to(args.output_path) + logging.info(f'Griffin NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) diff --git a/scripts/checkpoint_converters/convert_griffin_nemo_to_hf.py b/scripts/checkpoint_converters/convert_griffin_nemo_to_hf.py new file mode 100644 index 000000000000..265af9e55cbd --- /dev/null +++ 
b/scripts/checkpoint_converters/convert_griffin_nemo_to_hf.py @@ -0,0 +1,147 @@ +import os +from argparse import ArgumentParser + +from omegaconf.omegaconf import OmegaConf +from transformers import AutoConfig, RecurrentGemmaModel + +from nemo.collections.nlp.models.language_modeling.megatron_griffin_model import MegatronGriffinModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--hparams_file", + type=str, + default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_griffin_config.yaml", + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument("--input_path", type=str, default=None, required=True) + parser.add_argument( + "--precision", type=str, default="32", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + ) + args = parser.parse_args() + return args + + +def convert(args): + + nemo_config = OmegaConf.load(args.hparams_file) + nemo_config.trainer["precision"] = args.precision + + logging.info(f"Loading checkpoint from NeMo: `{args.input_path}`") + + trainer = MegatronLMPPTrainerBuilder(nemo_config).create_trainer() + + nemo_model = MegatronGriffinModel.restore_from(args.input_path, trainer=trainer) + hf_config = AutoConfig.from_pretrained("google/recurrentgemma-2b") + + # NeMo doesn't support LM Head for Griffin yet, so RecurrentGemmaModel is used instead of AutoModelForCausalLM + hf_model = RecurrentGemmaModel._from_config(hf_config) + + new_state_dict = {} + + new_state_dict['embed_tokens.weight'] = nemo_model.state_dict()['model.embedding.word_embeddings.weight'] + new_state_dict['final_norm.weight'] = nemo_model.state_dict()['model.decoder.final_layernorm.weight'] + + for l in range(nemo_config.model.num_layers): + print(f"Converting Layer {l}") + print("********************") + + ( + new_state_dict[f'layers.{l}.mlp_block.gate_proj.weight'], + new_state_dict[f'layers.{l}.mlp_block.up_proj.weight'], + ) = nemo_model.state_dict()[f'model.decoder.layers.{l}.mlp.linear_fc1.weight'].chunk(2) + ( + new_state_dict[f'layers.{l}.mlp_block.gate_proj.bias'], + new_state_dict[f'layers.{l}.mlp_block.up_proj.bias'], + ) = nemo_model.state_dict()[f'model.decoder.layers.{l}.mlp.linear_fc1.bias'].chunk(2) + new_state_dict[f'layers.{l}.mlp_block.down_proj.weight'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.mlp.linear_fc2.weight' + ] + new_state_dict[f'layers.{l}.mlp_block.down_proj.bias'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.mlp.linear_fc2.bias' + ] + + new_state_dict[f'layers.{l}.channel_pre_norm.weight'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight' + ] + + if l % 3 == 2: + + new_state_dict[f'layers.{l}.temporal_block.o_proj.weight'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.self_attention.linear_proj.weight' + ] + new_state_dict[f'layers.{l}.temporal_block.o_proj.bias'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.self_attention.linear_proj.bias' + ] + new_state_dict[f'layers.{l}.temporal_pre_norm.weight'] = 
nemo_model.state_dict()[ + f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight' + ] + ( + new_state_dict[f'layers.{l}.temporal_block.q_proj.weight'], + new_state_dict[f'layers.{l}.temporal_block.k_proj.weight'], + new_state_dict[f'layers.{l}.temporal_block.v_proj.weight'], + ) = nemo_model.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_qkv.weight'].split( + [2560, 256, 256] + ) + + else: + + new_state_dict[f'layers.{l}.temporal_pre_norm.weight'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.linear_in.layer_norm_weight' + ] + ( + new_state_dict[f'layers.{l}.temporal_block.linear_x.weight'], + new_state_dict[f'layers.{l}.temporal_block.linear_y.weight'], + ) = nemo_model.state_dict()[f'model.decoder.layers.{l}.recurrent_layer.linear_in.weight'].chunk(2) + ( + new_state_dict[f'layers.{l}.temporal_block.linear_x.bias'], + new_state_dict[f'layers.{l}.temporal_block.linear_y.bias'], + ) = nemo_model.state_dict()[f'model.decoder.layers.{l}.recurrent_layer.linear_in.bias'].chunk(2) + + new_state_dict[f'layers.{l}.temporal_block.linear_out.weight'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.linear_out.weight' + ] + new_state_dict[f'layers.{l}.temporal_block.linear_out.bias'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.linear_out.bias' + ] + + new_state_dict[f'layers.{l}.temporal_block.conv_1d.weight'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.conv_1d.conv_1d.weight' + ] + new_state_dict[f'layers.{l}.temporal_block.conv_1d.bias'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.conv_1d.conv_1d.bias' + ] + + new_state_dict[f'layers.{l}.temporal_block.rg_lru.recurrent_param'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.rg_lru.a_param' + ] + new_state_dict[f'layers.{l}.temporal_block.rg_lru.input_gate_weight'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.rg_lru.input_gate.w' + ] + new_state_dict[f'layers.{l}.temporal_block.rg_lru.input_gate_bias'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.rg_lru.input_gate.b' + ] + new_state_dict[f'layers.{l}.temporal_block.rg_lru.recurrent_gate_weight'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.rg_lru.a_gate.w' + ] + new_state_dict[f'layers.{l}.temporal_block.rg_lru.recurrent_gate_bias'] = nemo_model.state_dict()[ + f'model.decoder.layers.{l}.recurrent_layer.rg_lru.a_gate.b' + ] + + hf_model.load_state_dict(new_state_dict, strict=True) + dtype = torch_dtype_from_precision(args.precision) + hf_model = hf_model.to(dtype=dtype) + + hf_model.save_pretrained(args.output_path) + logging.info(f'Full HF model model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) From 93e326592c67f17af5c97fb4bd1f69371b861055 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Fri, 3 May 2024 12:10:59 -0400 Subject: [PATCH 028/178] Llama3 Conversion Script Update (#9089) * Add conversion script and CI test * fix llama2 vocab_file * typo --- .github/workflows/cicd-main.yml | 25 +++++++++++++++ .../convert_llama_hf_to_nemo.py | 32 ++++++++++++++++--- .../convert_llama_nemo_to_hf.py | 2 +- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index df631443e7f7..8389efff07ad 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -271,6 +271,31 @@ jobs: - uses: 
"NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + L2_Community_LLM_Checkpoints_tests_Llama3: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - run: | + CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ + --input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \ + --output_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo \ + --precision=16 + rm -f /home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Community_LLM_Checkpoints_tests_StarCoder: needs: [cicd-test-container-setup] runs-on: self-hosted-azure diff --git a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py index c8ccf50aa05f..e1dc00c77439 100644 --- a/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py @@ -27,7 +27,7 @@ import torch from omegaconf import OmegaConf from pytorch_lightning.trainer.trainer import Trainer -from transformers import LlamaForCausalLM, LlamaTokenizer +from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import ( @@ -78,7 +78,19 @@ def load_config(args, llama_config): nemo_config.num_query_groups = llama_config['num_key_value_heads'] nemo_config.use_cpu_initialization = True nemo_config.activation = 'fast-swiglu' - nemo_config.tokenizer.model = llama_config['tokenizer_model'] + + # Tokenizer config + if 'tokenizer_model' in llama_config: + nemo_config.tokenizer.model = llama_config['tokenizer_model'] + else: + # Llama3 uses converted TikToken Tokenizer + tokenizer_dict = { + 'library': 'huggingface', + 'type': args.input_name_or_path, + 'use_fast': True, + } + nemo_config.tokenizer = tokenizer_dict + if llama_config['rope_scaling'] is not None: if llama_config['rope_scaling']['type'] == 'linear': nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor'] @@ -98,9 +110,12 @@ def load_config(args, llama_config): def convert(args): logging.info(f"loading checkpoint {args.input_name_or_path}") model = LlamaForCausalLM.from_pretrained(args.input_name_or_path) - tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path) hf_config = vars(model.config) - hf_config['tokenizer_model'] = str(tokenizer.vocab_file) + if os.path.exists(f'{args.input_name_or_path}/tokenizer.model'): + tokenizer = LlamaTokenizer.from_pretrained(args.input_name_or_path) + hf_config['tokenizer_model'] = str(tokenizer.vocab_file) + else: + tokenizer = AutoTokenizer.from_pretrained(args.input_name_or_path) print(f"hf_config: {hf_config}") print("named parameters:") for name, param in model.named_parameters(): @@ -274,6 +289,15 @@ def convert(args): model._save_restore_connector = NLPSaveRestoreConnector() + # We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path + if 'tokenizer_model' not in hf_config: + if hf_config['num_hidden_layers'] == 32: + 
model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-8B') + elif hf_config['num_hidden_layers'] == 80: + model.cfg.tokenizer.update(type='meta-llama/Meta-Llama-3-70B') + else: + logging.warning("Unexpected model config for Llama3. Tokenizer config has not been modified.") + # cast to target precision and disable cpu init dtype = torch_dtype_from_precision(precision) model = model.to(dtype=dtype) diff --git a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py index 159676f8b58e..8da15148dfd8 100644 --- a/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py @@ -263,5 +263,5 @@ def replace_hf_weights_and_tokenizer( args.hf_output_tokenizer, ) else: - logging.info("`hf-in-path` and/or `hf-out-path` not provided, not generating full HF model.") + logging.info("`hf_input_path` and/or `hf_output_path` not provided, not generating full HF model.") logging.info(f".bin file is saved to {args.output_path}") From 805e5ec595dd217a3c3b39577e0e998b2ce38570 Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 3 May 2024 12:23:31 -0400 Subject: [PATCH 029/178] Update radtts.py (#9097) * Update radtts.py Signed-off-by: Jason * Update Jenkinsfile Signed-off-by: Jason * Update cicd-main.yml Signed-off-by: Jason * Update cicd-main.yml Signed-off-by: Jason * Update Jenkinsfile Signed-off-by: Jason * Update cicd-main.yml Signed-off-by: Jason --------- Signed-off-by: Jason --- .github/workflows/cicd-main.yml | 3 ++- examples/tts/radtts.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 8389efff07ad..ad6a1faf78ae 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -78,7 +78,7 @@ jobs: run: | # Pull base PyTorch container docker pull nvcr.io/nvidia/pytorch:24.02-py3 - docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.02-py3 /bin/bash -c ' + docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.02-py3 /bin/bash -c ' set -x # PyTorch version @@ -6224,6 +6224,7 @@ jobs: L2_TTS_Fast_dev_runs_1_RADTTS: needs: [cicd-test-container-setup] runs-on: self-hosted-azure + timeout-minutes: 15 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: diff --git a/examples/tts/radtts.py b/examples/tts/radtts.py index 7dbdaedced03..09bf69a2d6e5 100644 --- a/examples/tts/radtts.py +++ b/examples/tts/radtts.py @@ -68,7 +68,7 @@ def main(cfg): lr_logger = pl.callbacks.LearningRateMonitor() epoch_time_logger = LogEpochTimeCallback() trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model.cuda()) + trainer.fit(model) if __name__ == '__main__': From f28773f14f6bdc8d0f7f7bee1da17aea44c2f803 Mon Sep 17 00:00:00 2001 From: mikolajblaz Date: Fri, 3 May 2024 19:29:56 +0200 Subject: [PATCH 030/178] Implement DistributedCheckpointIO (#9016) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Introduce DistributedCheckpointIO Signed-off-by: Mikołaj Błaż * Fix DistCkptIO usage Signed-off-by: 
Mikołaj Błaż * Use NeMo logger Signed-off-by: Mikołaj Błaż * [DCIO] Fix save_to dist ckpt path Signed-off-by: Mikołaj Błaż * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use dist ckpt flag in all methods Signed-off-by: Mikołaj Błaż * Improve error msg Signed-off-by: Mikołaj Błaż * Add dist ckpt unit tests Signed-off-by: Mikołaj Błaż * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix load_checkpoint Signed-off-by: Mikołaj Błaż * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Mikołaj Błaż * Fix auto-issues Signed-off-by: Mikołaj Błaż * Fix ckpt_dir var Signed-off-by: Mikołaj Błaż * Restore skipping behavior The fix from prevent-duplicated-checkpoints is required to skip the checkpoints Signed-off-by: Mikołaj Błaż * Fix steps on single-GPU machine Signed-off-by: Mikołaj Błaż * Add docs Signed-off-by: Mikołaj Błaż * Apply black Signed-off-by: Mikołaj Błaż * Fix num steps in tests Signed-off-by: Mikołaj Błaż * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Mikołaj Błaż * Use dist-ckpt for Bert Signed-off-by: Mikołaj Błaż * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix load checkpoint return val Signed-off-by: Mikołaj Błaż * Use dist-ckpt based on sharded_state_dict Signed-off-by: Mikołaj Błaż * Use correct checkpoint_io Signed-off-by: Mikołaj Błaż * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mikołaj Błaż Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/megatron_gpt_config.yaml | 4 +- .../nlp/parts/megatron_trainer_builder.py | 9 +- nemo/collections/nlp/parts/nlp_overrides.py | 84 +++++++--------- nemo/utils/callbacks/dist_ckpt_io.py | 85 ++++++++++++++++ tests/core/test_dist_ckpt.py | 98 +++++++++++++++++++ 5 files changed, 227 insertions(+), 53 deletions(-) create mode 100644 nemo/utils/callbacks/dist_ckpt_io.py create mode 100644 tests/core/test_dist_ckpt.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ea37237f2eac..57c82726ae11 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -150,8 +150,8 @@ model: fsdp_grad_reduce_dtype: 32 # Gradient reduction data type. fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. - # PyTorch distributed checkpoint - torch_distributed_checkpoint: False # Set to True to use PyTorch distributed checkpoint format. + # Distributed checkpoint format + dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. 
diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 6b9763a53414..ad184157abc3 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -31,6 +31,7 @@ PipelineMixedPrecisionPlugin, ) from nemo.utils import logging +from nemo.utils.callbacks.dist_ckpt_io import DistributedCheckpointIO class MegatronTrainerBuilder: @@ -80,7 +81,6 @@ def _training_strategy(self) -> Union[NLPDDPStrategy, NLPFSDPStrategy]: find_unused_parameters=False, nccl_communicator_config_path=self.cfg.model.get('nccl_communicator_config_path', None), sharp=self.cfg.model.get('sharp', False), - torch_dist_ckpt=self.cfg.model.get('torch_distributed_checkpoint', False), ) def _grad_scaler(self) -> GradScaler: @@ -127,6 +127,13 @@ def _plugins(self) -> list: if self.cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) + # Use dist-ckt for non-FSDP MCore models + use_dist_ckpt = not self.cfg.model.get('fsdp', False) and ( + self.cfg.model.get('mcore_gpt', False) or self.cfg.model.get('mcore_bert', False) + ) + if use_dist_ckpt: + plugins.append(DistributedCheckpointIO(self.cfg.model.get('dist_ckpt_format', 'zarr'))) + return plugins def create_trainer(self, callbacks=None) -> Trainer: diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 0a030759fe9b..b477c64a7510 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -25,6 +25,7 @@ import pytorch_lightning as pl import torch +from lightning_fabric.plugins import TorchCheckpointIO from lightning_fabric.utilities.cloud_io import get_filesystem from lightning_fabric.utilities.optimizer import _optimizer_to_device from omegaconf import OmegaConf @@ -54,6 +55,8 @@ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy from torch.nn.parallel import DistributedDataParallel +from nemo.utils.get_rank import is_global_rank_zero + try: from torch.cuda.amp.grad_scaler import _refresh_per_optimizer_state except ImportError: @@ -68,7 +71,6 @@ from nemo.core.optim import MainParamsOptimizerWrapper from nemo.core.optim.optimizers import init_optimizer_states from nemo.utils import AppState, logging -from nemo.utils.get_rank import is_global_rank_zero from nemo.utils.model_utils import ckpt_to_dir, inject_model_parallel_rank, uninject_model_parallel_rank try: @@ -104,6 +106,7 @@ from megatron.core.tensor_parallel.layers import param_is_not_tensor_parallel_duplicate from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_layer import TransformerLayer as MCoreTransformerLayer + from nemo.utils.callbacks.dist_ckpt_io import DistributedCheckpointIO HAVE_MEGATRON_CORE = True @@ -178,7 +181,6 @@ def __init__( no_ddp_communication_hook: bool = False, nccl_communicator_config_path: Optional[str] = None, sharp: bool = False, - torch_dist_ckpt: bool = False, **kwargs: Union[Any, Dict[str, Any]], ) -> None: if not HAVE_APEX: @@ -195,7 +197,6 @@ def __init__( self.no_ddp_communication_hook = no_ddp_communication_hook self.nccl_communicator_config_path = nccl_communicator_config_path self.sharp = sharp - self.torch_dist_ckpt = torch_dist_ckpt def setup(self, trainer: "pl.Trainer") -> None: """ @@ -350,10 +351,7 @@ def save_checkpoint( called on every rank and internally does the rank checking. 
""" # check if using distributed checkpointing - if ( - hasattr(self.lightning_module, 'sharded_state_dict') - and self.lightning_module.sharded_state_dict() is not None - ): + if self.use_distributed_checkpointing: assert ( len(checkpoint['optimizer_states']) == 1 ), "Currently only support checkpointing 1 distributed optimizer per time!" @@ -371,16 +369,10 @@ def save_checkpoint( logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving') return - if is_global_rank_zero(): - fs.makedirs(checkpoint_dir, exist_ok=True) - # remove device state_dict checkpoint['state_dict'] = OrderedDict([]) - sharded_strategy = ('torch_dist', 1) if self.torch_dist_ckpt else ('zarr', 1) - dist_checkpointing.save( - sharded_state_dict=checkpoint, checkpoint_dir=checkpoint_dir, sharded_strategy=sharded_strategy - ) + self.checkpoint_io.save_checkpoint(checkpoint, ckpt_to_dir(filepath), storage_options=storage_options) else: # PTL override to accomodate model parallel checkpoints filepath = inject_model_parallel_rank(filepath) @@ -390,10 +382,7 @@ def save_checkpoint( # PTL 2.2 supports non strict loading of the ckpt with the strict arg (https://github.com/Lightning-AI/pytorch-lightning/pull/19404) def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = True) -> None: # if using distributed checkpointing, the state dict logic is at the model level - if ( - hasattr(self.lightning_module, 'sharded_state_dict') - and self.lightning_module.sharded_state_dict() is not None - ): + if self.use_distributed_checkpointing: return # legacy state dict logic, does not use megatron core @@ -442,11 +431,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: fs = get_filesystem(checkpoint_path) # Check if using distributed checkpointing - if ( - hasattr(self.lightning_module, 'sharded_state_dict') - and self.lightning_module.sharded_state_dict() is not None - ): - + if self.use_distributed_checkpointing: # Distributed checkpoints must be directories. if not fs.isdir(checkpoint_path): raise ValueError(f'Distributed checkpoints should be a directory. 
Found: {checkpoint_path}.') @@ -458,16 +443,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: # after dist_checkpointing.load, sharded tensors will be replaced with tensors checkpoint['state_dict'] = sharded_state_dict checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict()] - - if self.torch_dist_ckpt: - sharded_strategy = ('torch_dist', 1) - else: - sharded_strategy = tensorstore.TensorStoreLoadShardedStrategy(load_directly_on_device=True) - checkpoint = dist_checkpointing.load( - sharded_state_dict=checkpoint, checkpoint_dir=checkpoint_path, sharded_strategy=sharded_strategy - ) - - return checkpoint + return self.checkpoint_io.load_checkpoint(checkpoint_path, sharded_state_dict=checkpoint) # Legacy model parallel checkpointing logic, does not use megatron core else: @@ -480,12 +456,8 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: def remove_checkpoint(self, filepath: Union[str, Path]) -> None: # check if filepath is a distributed checkpoint - if ( - hasattr(self.lightning_module, 'sharded_state_dict') - and self.lightning_module.sharded_state_dict() is not None - ): - if self.is_global_zero: - shutil.rmtree(ckpt_to_dir(filepath), ignore_errors=True) + if self.use_distributed_checkpointing and self.is_global_zero: + self.checkpoint_io.remove_checkpoint(ckpt_to_dir(filepath)) # legacy checkpoint logic, does not use megatron core else: @@ -496,6 +468,25 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None: logging.info(f'Removing checkpoint: {filepath}') self.checkpoint_io.remove_checkpoint(filepath) + @property + def use_distributed_checkpointing(self): + has_dist_ckpt_io = HAVE_MEGATRON_CORE and isinstance(self.checkpoint_io, DistributedCheckpointIO) + has_sharded_state_dict = ( + hasattr(self.lightning_module, 'sharded_state_dict') + and self.lightning_module.sharded_state_dict() is not None + ) + if has_sharded_state_dict and not has_dist_ckpt_io: + logging.warning( + 'Distributed checkpoints requires DistributedCheckpointIO plugin to be used. Setting up a default now.' + ) + self.checkpoint_io = DistributedCheckpointIO(self.lightning_module.cfg.get('dist_ckpt_format', 'zarr')) + if not has_sharded_state_dict and has_dist_ckpt_io: + logging.warning( + 'DistributedCheckpointIO configured but should not be used. Reverting back to TorchCheckpointIO' + ) + self.checkpoint_io = TorchCheckpointIO() + return has_sharded_state_dict + @property def distributed_sampler_kwargs(self): app_state = AppState() @@ -887,14 +878,8 @@ def dummy(): if model.trainer.strategy.launcher is not None: model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) model.trainer.strategy.setup_environment() - sharded_strategy = ( - ('torch_dist', 1) if model.cfg.get("torch_distributed_checkpoint", False) else ('zarr', 1) - ) - dist_checkpointing.save( - sharded_state_dict=sharded_state_dict, - checkpoint_dir=dist_ckpt_dir, - sharded_strategy=sharded_strategy, - ) + checkpoint_io = DistributedCheckpointIO(model.cfg.get('dist_ckpt_format', 'zarr')) + checkpoint_io.save_checkpoint(sharded_state_dict, dist_ckpt_dir) else: @@ -1177,9 +1162,8 @@ def dummy(): tmp_model_weights_ckpt = os.path.join(tmpdir, self.model_weights_ckpt) tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' 
- checkpoint = dist_checkpointing.load( - sharded_state_dict=checkpoint, checkpoint_dir=tmp_model_weights_dir - ) + checkpoint_io = DistributedCheckpointIO(conf.get('dist_ckpt_format', 'zarr')) + checkpoint = checkpoint_io.load_checkpoint(tmp_model_weights_dir, sharded_state_dict=checkpoint) instance.on_load_checkpoint(checkpoint) if hasattr(instance, 'setup_transformer_engine_tp_groups'): instance.setup_transformer_engine_tp_groups() diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py new file mode 100644 index 000000000000..7dff9b458a0d --- /dev/null +++ b/nemo/utils/callbacks/dist_ckpt_io.py @@ -0,0 +1,85 @@ +import shutil +from typing import Any, Dict, Optional + +from lightning_fabric.plugins import CheckpointIO +from lightning_fabric.utilities.cloud_io import get_filesystem +from lightning_fabric.utilities.types import _PATH +from megatron.core import dist_checkpointing +from megatron.core.dist_checkpointing.strategies import tensorstore + +from nemo.utils import logging + + +class DistributedCheckpointIO(CheckpointIO): + """ CheckpointIO for a distributed checkpoint format. + + Args: + save_ckpt_format (str): Distributed checkpoint format to use for checkpoint saving. + """ + + def __init__(self, save_ckpt_format: str): + super().__init__() + self.save_ckpt_format = save_ckpt_format + + self.save_sharded_strategy = self.determine_dist_ckpt_save_strategy() + + def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None: + """ Saves a distributed checkpoint. Creates the checkpoint root directory if doesn't exist. + + Args: + checkpoint (Dict[str, Any]): sharded state dict to save + path (_PATH): checkpoint directory + storage_options (Any, optional): Optional parameters when saving the checkpoint + """ + fs = get_filesystem(path) + fs.makedirs(path, exist_ok=True) + + dist_checkpointing.save( + sharded_state_dict=checkpoint, checkpoint_dir=path, sharded_strategy=self.save_sharded_strategy + ) + + def load_checkpoint( + self, path: _PATH, map_location: Optional[Any] = None, sharded_state_dict: Dict[str, Any] = None + ) -> Dict[str, Any]: + """ Loads a distributed checkpoint. + + Args: + path (_PATH): checkpoint directory + map_location (Any, optional): required to be None in this implementation + sharded_state_dict (Dict[str, Any], optional): state dict which + defines the loading procedure for the distributed checkpoint. + Defaults to None to comply with the CheckpointIO interface, + but it's a required argument. + + Returns: + Dist[str, Any]: loaded checkpoint. + """ + if sharded_state_dict is None: + raise ValueError('DistributedCheckpointIO requires passing sharded_state_dict argument to load_checkpoint') + if map_location is not None: + raise ValueError('DistributedCheckpointIO doesnt handle map_location argument') + + if self.save_ckpt_format == 'zarr': + sharded_strategy = tensorstore.TensorStoreLoadShardedStrategy(load_directly_on_device=True) + else: + sharded_strategy = None + + return dist_checkpointing.load( + sharded_state_dict=sharded_state_dict, checkpoint_dir=path, sharded_strategy=sharded_strategy + ) + + def remove_checkpoint(self, path: _PATH) -> None: + """ Remove a distributed checkpoint. + + Due to potentially large number of files, the implementation remove the whole directory at once. + """ + shutil.rmtree(path, ignore_errors=True) + + def determine_dist_ckpt_save_strategy(self): + """ Determine the saving strategy based on storage config. 
+ + For now only decides the checkpoint format. + """ + save_strategy = (self.save_ckpt_format, 1) + logging.info(f'Using {save_strategy} dist-ckpt save strategy.') + return save_strategy diff --git a/tests/core/test_dist_ckpt.py b/tests/core/test_dist_ckpt.py new file mode 100644 index 000000000000..b6dc5ca89d3e --- /dev/null +++ b/tests/core/test_dist_ckpt.py @@ -0,0 +1,98 @@ +import os +import types +from pathlib import Path + +import pytest +import pytorch_lightning as pl +import torch +from lightning_fabric.plugins import TorchCheckpointIO +from pytorch_lightning.demos.boring_classes import BoringModel + +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils.callbacks.dist_ckpt_io import DistributedCheckpointIO + + +class ExampleModel(BoringModel): + def on_validation_epoch_end(self) -> None: + self.log("val_loss", torch.tensor(1.0)) + + +class ExampleMCoreModel(ExampleModel): + def sharded_state_dict(self): + return {'a': 3} + + +class MockDistributedCheckpointIO(DistributedCheckpointIO): + def __init__(self, save_ckpt_format): + super().__init__(save_ckpt_format) + self.save_checkpoint_called_args = None + + def save_checkpoint(self, *args, **kwargs) -> None: + self.save_checkpoint_called_args = args, kwargs + + +class MockTorchCheckpointIO(TorchCheckpointIO): + def __init__(self): + super().__init__() + self.save_checkpoint_called_args = None + + def save_checkpoint(self, *args, **kwargs) -> None: + self.save_checkpoint_called_args = args, kwargs + + +def _get_last_checkpoint_dir(root_dir: Path, model: pl.LightningModule, suffix: str = '') -> Path: + steps = len(model.train_dataloader().dataset) * model.trainer.max_epochs // torch.distributed.get_world_size() + return root_dir / 'checkpoints' / f'epoch=1-step={steps}{suffix}' + + +class TestDistCkptIO: + @pytest.mark.run_only_on('GPU') + def test_dist_ckpt_io_called_for_mcore_models(self, tmp_path): + strategy = NLPDDPStrategy() + # skip optimizer sharded state creation: + strategy.optimizer_sharded_state_dict = types.MethodType( + lambda self, unsharded_optim_state: unsharded_optim_state, strategy + ) + checkpoint_io = MockDistributedCheckpointIO('xxx') + + test_trainer = pl.Trainer( + enable_checkpointing=True, + logger=False, + max_epochs=2, + strategy=strategy, + plugins=[checkpoint_io], + default_root_dir=tmp_path, + ) + model = ExampleMCoreModel() + test_trainer.fit(model) + + assert isinstance(test_trainer.strategy.checkpoint_io, MockDistributedCheckpointIO) + assert checkpoint_io.save_checkpoint_called_args is not None + (state_dict, path), _ = checkpoint_io.save_checkpoint_called_args + # Ckpt path doesn't contain the .ckpt suffix + assert path.name == _get_last_checkpoint_dir(tmp_path, model).name, len(test_trainer.strategy.parallel_devices) + + @pytest.mark.run_only_on('GPU') + def test_dist_ckpt_path_not_executed_for_non_core_models(self, tmp_path): + strategy = NLPDDPStrategy() + checkpoint_io = MockTorchCheckpointIO() + + test_trainer = pl.Trainer( + enable_checkpointing=True, + logger=False, + max_epochs=2, + strategy=strategy, + plugins=[checkpoint_io], + default_root_dir=tmp_path, + ) + model = ExampleModel() + test_trainer.fit(model) + + assert isinstance(test_trainer.strategy.checkpoint_io, MockTorchCheckpointIO) + if test_trainer.is_global_zero: + assert checkpoint_io.save_checkpoint_called_args is not None + (state_dict, path), _ = checkpoint_io.save_checkpoint_called_args + # Ckpt path *does* contain the .ckpt suffix + assert os.path.basename(path) == 
_get_last_checkpoint_dir(tmp_path, model, suffix='.ckpt').name + else: + assert checkpoint_io.save_checkpoint_called_args is None From c5a5a79ee154b1a7cd0caf4ef63b35b1ea24768d Mon Sep 17 00:00:00 2001 From: paul-gibbons <87940629+paul-gibbons@users.noreply.github.com> Date: Fri, 3 May 2024 11:37:31 -0700 Subject: [PATCH 031/178] Video Neva Pretraining + Inference Implementation (#9095) * video_neva pretrain * support video neva inference Signed-off-by: Vivian Chen * yaml update, adding media_type * yaml update, adding media_type * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * modify neva inference config Signed-off-by: Vivian Chen * modify based on review Signed-off-by: Vivian Chen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove video test asset Signed-off-by: Vivian Chen * video_neva doc, describing config changes. Signed-off-by: paul-gibbons * Revert "video_neva doc, describing config changes." This reverts commit 1a02ccd3adf30e851b1f74b0780c4a785c92eb43. * vneva brief doc Signed-off-by: paul-gibbons * vneva doc update Signed-off-by: paul-gibbons * doc update Signed-off-by: paul-gibbons * Revert "doc update" This reverts commit 80af9a43a342fa3ab1c7a4f002694bb23fd2af91. * doc update Signed-off-by: paul-gibbons * Revert "doc update" This reverts commit 8c885c7633b8b04ebdf3ce8280f2c3bb54ed0f20. * doc update Signed-off-by: paul-gibbons * Revert "doc update" This reverts commit 94aba65911d518b083c9a238c8f02d06979ef1ec. * doc update Signed-off-by: paul-gibbons * add inference doc to docs, resolve review Signed-off-by: Vivian Chen * modify inference config for other mlm Signed-off-by: Vivian Chen --------- Signed-off-by: Vivian Chen Signed-off-by: paul-gibbons Co-authored-by: Vivian Chen Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source/multimodal/mllm/video_neva.rst | 134 +++++++++ .../neva/conf/llava_config.yaml | 1 + .../multimodal_llm/neva/conf/neva_config.yaml | 1 + .../neva/conf/neva_finetune.yaml | 1 + .../neva/conf/neva_inference.yaml | 5 +- .../multimodal_llm/neva/conf/neva_peft.yaml | 1 + .../neva/conf/video_neva_config.yaml | 222 ++++++++++++++ .../multimodal_llm/neva/eval/gradio_server.py | 6 +- .../multimodal_llm/neva/eval/vqa_science.py | 10 +- .../multimodal_llm/neva/neva_evaluation.py | 22 +- .../multimodal/data/neva/conversation.py | 1 + .../multimodal/data/neva/neva_dataset.py | 274 +++++++++++++++--- .../models/multimodal_llm/neva/neva_model.py | 3 +- nemo/collections/multimodal/parts/utils.py | 76 ++++- .../common/text_generation_strategy.py | 15 +- .../modules/common/text_generation_utils.py | 6 +- requirements/requirements_multimodal.txt | 1 + tutorials/multimodal/NeVA Tutorial.ipynb | 6 +- 18 files changed, 702 insertions(+), 83 deletions(-) create mode 100644 docs/source/multimodal/mllm/video_neva.rst create mode 100644 examples/multimodal/multimodal_llm/neva/conf/video_neva_config.yaml diff --git a/docs/source/multimodal/mllm/video_neva.rst b/docs/source/multimodal/mllm/video_neva.rst new file mode 100644 index 000000000000..b5831a45ab28 --- /dev/null +++ b/docs/source/multimodal/mllm/video_neva.rst @@ -0,0 +1,134 @@ +Video NeVA +========== + +Model Introduction +------------------ + +Video NeVa adds support for video modality in NeVa by representing video as multiple image frames. 
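+
+The snippet below is a minimal, illustrative sketch of that idea and not the exact NeMo implementation (the helper name is made up for this example): a video is opened with Decord, ``num_frames`` evenly spaced frames are sampled, and each sampled frame is then preprocessed like a regular NeVA image.
+
+.. code-block:: python
+
+    import numpy as np
+    from decord import VideoReader
+
+    def sample_video_frames(video_path: str, num_frames: int = 8) -> np.ndarray:
+        # Illustrative helper, not part of the NeMo API.
+        vr = VideoReader(video_path)
+        indices = np.linspace(0, len(vr) - 1, num_frames).astype(int).tolist()
+        # Shape: (num_frames, height, width, 3); each frame is handed to the image preprocessor.
+        return vr.get_batch(indices).asnumpy()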
+ +There is only a minor change done to :class:`~nemo.collections.multimodal.models.multimodal_llm.neva.neva_model.MegatronNevaModel` class in order to support pretraining on video input data. + +Representing video input as a series of images is done in :class:`~nemo.collections.multimodal.data.neva.TarOrFolderVideoLoader` class, using Decord which provides convenient video slicing methods. + + +Video Neva Configuration +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + data: + media_type: video + splice_single_frame: null + num_frames: 8 + image_token_len: 256 + image_folder: null + video_folder: null + +- ``media_type``: If set to `video`, NeVa's dataloader goes through the additional preprocessing steps to represent the input video data as a series of image frames. +- ``splice_single_frame``: Can either be set as `first`, `middle` or `last`. This will result in only a single frame in that specific location of the video being selected. +- ``image_token_len``: The NeVa dataloader calculates `image_token_len` based on the height and width of the preprocessed image frame and the patch size of the CLIP model being used. + +.. code-block:: python + + image_token_len = (224 // 14) * (224 // 14) = 16 * 16 = 256 + +- ``num_frames``: This is used to select the number of image frames that will be used to represent the video. +- ``video_folder``: This specifies the directory where the video files are located. This follows the same format as NeVa's `image_folder`. + + + +Inference with Video NeVA +========================= + +We can run ``neva_evaluation.py`` located in ``NeMo/examples/multimodal/multimodal_llm/neva`` to generate inference results from the Video NeVA model. +Currently, video NeVA supports both image and video inference by changing the config attribute ``inference.media_type`` in ``NeMo/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml`` to either ``image`` or ``video``, and adding the corresponding media path ``inference.media_base_path``. + +Inference with Pretrained Projectors with Base LM Model +------------------------------------------------------- + +An example of an inference script execution: + +For running video inference:: + + CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 /path/to/neva_evaluation.py \ + --config-path=/path/to/conf/ \ + --config-name=neva_inference.yaml \ + tensor_model_parallel_size=4 \ + pipeline_model_parallel_size=1 \ + neva_model_file=/path/to/projector/checkpoint \ + base_model_file=/path/to/base/lm/checkpoint \ + trainer.devices=4 \ + trainer.precision=bf16 \ + prompt_file=/path/to/prompt/file \ + inference.media_base_path=/path/to/videos \ + inference.media_type=video \ + output_file=/path/for/output/file/ \ + inference.temperature=0.2 \ + inference.top_k=0 \ + inference.top_p=0.9 \ + inference.greedy=False \ + inference.add_BOS=False \ + inference.all_probs=False \ + inference.repetition_penalty=1.2 \ + inference.insert_media_token=right \ + inference.tokens_to_generate=256 \ + quantization.algorithm=awq \ + quantization.enable=False + +Example format of ``.jsonl`` prompt_file:: + + {"video": "video_test.mp4", "text": "Can you describe the scene?", "category": "conv", "question_id": 0} + +input video file:: video_test.mp4 + +Output:: + + System + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. + + User + Can you describe the scene?