From a153b8c58d56aa930749587017ee70d56f75445e Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Fri, 22 Nov 2024 13:04:32 -0500 Subject: [PATCH 01/11] Fix transcribe speech (#11379) * add explicit bracket for or operation Signed-off-by: Nithin Rao Koluguri * add cfg arg Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- examples/asr/transcribe_speech.py | 3 +++ nemo/collections/asr/models/ctc_models.py | 1 + nemo/collections/asr/models/rnnt_models.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index f1d61edc990e..5c4a636e8b1c 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -276,6 +276,9 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis # we will adjust this flag if the model does not support it compute_langs = cfg.compute_langs + if cfg.timestamps: + cfg.return_hypotheses = True + # Check whether model and decoder type match if isinstance(asr_model, EncDecCTCModel): if cfg.decoder_type and cfg.decoder_type != 'ctc': diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 3df6a7352c4d..76dcd13cca50 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -160,6 +160,7 @@ def transcribe( A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files """ + timestamps = timestamps or (override_config.timestamps if override_config is not None else None) if timestamps is not None: # else retain the decoder state (users can set it using change_decoding_strategy) if timestamps or (override_config is not None and override_config.timestamps): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index a6408b5e935e..e4d1abd0b50c 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -285,7 +285,7 @@ def transcribe( * A list of greedy transcript texts / Hypothesis * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. 
""" - + timestamps = timestamps or (override_config.timestamps if override_config is not None else None) if timestamps is not None: if timestamps or (override_config is not None and override_config.timestamps): logging.info( From d033737c96c7639f2e3912a0f6fea65be2267688 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Fri, 22 Nov 2024 14:10:10 -0500 Subject: [PATCH 02/11] Add llama 3.2 1b and 3b (#11335) * add llama 3.2 1b and 3b Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * add recipe to init Signed-off-by: Chen Cui * fix path Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- nemo/collections/llm/__init__.py | 4 + nemo/collections/llm/gpt/model/__init__.py | 4 + nemo/collections/llm/gpt/model/llama.py | 39 ++- nemo/collections/llm/recipes/__init__.py | 4 + nemo/collections/llm/recipes/llama32_1b.py | 270 +++++++++++++++++++++ nemo/collections/llm/recipes/llama32_3b.py | 270 +++++++++++++++++++++ 6 files changed, 588 insertions(+), 3 deletions(-) create mode 100644 nemo/collections/llm/recipes/llama32_1b.py create mode 100644 nemo/collections/llm/recipes/llama32_3b.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index c5c2e007bc1e..c36da39b43c7 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -73,6 +73,8 @@ Llama31Config8B, Llama31Config70B, Llama31Config405B, + Llama32Config1B, + Llama32Config3B, LlamaConfig, LlamaModel, MaskedTokenLossReduction, @@ -171,6 +173,8 @@ "Llama31Config8B", "Llama31Config70B", "Llama31Config405B", + "Llama32Config1B", + "Llama32Config3B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 152309536f5b..9f186ebba90f 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -59,6 +59,8 @@ Llama31Config8B, Llama31Config70B, Llama31Config405B, + Llama32Config1B, + Llama32Config3B, LlamaConfig, LlamaModel, ) @@ -134,6 +136,8 @@ "Llama31Config8B", "Llama31Config70B", "Llama31Config405B", + "Llama32Config1B", + "Llama32Config3B", "NemotronConfig", "Nemotron3Config4B", "Nemotron3Config8B", diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index f5f6de6c79e7..a7e995addb83 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -14,6 +14,7 @@ import math from dataclasses import dataclass +from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional @@ -86,7 +87,7 @@ class Llama2Config70B(LlamaConfig): @dataclass -class Llama3Config(GPTConfig): +class Llama3Config(LlamaConfig): num_query_groups: int = 8 hidden_dropout: float = 0.0 attention_dropout: float = 0.0 @@ -182,6 +183,32 @@ class Llama31Config405B(Llama31Config): make_vocab_size_divisible_by: int = 128 +@dataclass +class Llama32Config1B(Llama31Config): + scale_factor: int = 32 + share_embeddings_and_output_weights: bool = True + rotary_base: int = 500_000 + num_layers: int = 16 + hidden_size: int = 2048 + ffn_hidden_size: int = 8192 + num_attention_heads: int = 32 + num_query_groups: int = 8 + make_vocab_size_divisible_by: int = 128 + + +@dataclass +class Llama32Config3B(Llama31Config): + scale_factor: int = 32 + share_embeddings_and_output_weights: bool = True + rotary_base: int = 500_000 + num_layers: int = 28 + hidden_size: 
int = 3072 + ffn_hidden_size: int = 8192 + num_attention_heads: int = 24 + num_query_groups: int = 8 + make_vocab_size_divisible_by: int = 128 + + @dataclass class CodeLlamaConfig7B(Llama2Config7B): rotary_base: int = 1_000_000 @@ -252,6 +279,9 @@ def convert_state(self, source, target): "model.norm.weight": "decoder.final_layernorm.weight", "lm_head.weight": "output_layer.weight", } + if getattr(source.config, "tie_word_embeddings", False): + # llama 3.2 1B and 3B models have no shared input output embeddings + del mapping["lm_head.weight"] return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) @@ -275,7 +305,7 @@ def make_vocab_size_divisible_by(vocab_size): if getattr(source, 'rope_scaling', None) is not None and source.rope_scaling.get('rope_type') == 'llama3': # Apply Llama3.1 customize rope scaling - cls = Llama31Config + cls = partial(Llama31Config, scale_factor=source.rope_scaling.get("factor", 8.0)) else: cls = LlamaConfig output = cls( @@ -289,7 +319,7 @@ def make_vocab_size_divisible_by(vocab_size): rotary_base=source.rope_theta, gated_linear_unit=True, make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), - share_embeddings_and_output_weights=False, + share_embeddings_and_output_weights=getattr(source, "tie_word_embeddings", False), fp16=(dtype_from_hf(source) == torch.float16), bf16=(dtype_from_hf(source) == torch.bfloat16), params_dtype=dtype_from_hf(source), @@ -355,6 +385,7 @@ def config(self) -> "HFLlamaConfig": num_key_value_heads=source.num_query_groups, rope_theta=source.rotary_base, vocab_size=self.tokenizer.vocab_size, + tie_word_embeddings=source.share_embeddings_and_output_weights, ) @@ -509,6 +540,8 @@ def apply_rope_scaling( "Llama31Config8B", "Llama31Config70B", "Llama31Config405B", + "Llama32Config1B", + "Llama32Config3B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 449592298d41..1db88f633e89 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -33,6 +33,8 @@ llama31_8b, llama31_70b, llama31_405b, + llama32_1b, + llama32_3b, mamba2_1_3b, mamba2_2_7b, mamba2_8b, @@ -89,6 +91,8 @@ "llama31_8b", "llama31_70b", "llama31_405b", + "llama32_1b", + "llama32_3b", "mamba2_130m", "mamba2_370m", "mamba2_780m", diff --git a/nemo/collections/llm/recipes/llama32_1b.py b/nemo/collections/llm/recipes/llama32_1b.py new file mode 100644 index 000000000000..32675adf3686 --- /dev/null +++ b/nemo/collections/llm/recipes/llama32_1b.py @@ -0,0 +1,270 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Callable, Optional + +import lightning.pytorch as pl +import nemo_run as run +import torch +from lightning.pytorch.callbacks.callback import Callback +from megatron.core.distributed import DistributedDataParallelConfig + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs +from nemo.collections.llm.gpt.model.llama import Llama32Config1B, LlamaModel +from nemo.collections.llm.peft import PEFT_STR2CLS +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.utils.exp_manager import TimingCallback + +NAME = "llama32_1b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3.2 1B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3.2 1B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama32_1b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + conf = run.Config(Llama32Config1B) + conf.seq_length = 8192 + return run.Config(LlamaModel, config=conf) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3.2 1B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama32_1b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3.2 1B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama32_1b + $ nemo llm pretrain --factory "llama32_1b(num_nodes=1, name='my_1b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama32_1b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + This recipe is optimized for the large 8B model and requires significant computational resources. + """ + recipe = run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', + seq_length: Optional[int] = None, + packed_sequence: Optional[bool] = None, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3.2 1B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. 
+ peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + seq_length (int): Maximum number of tokens per microbatch. + packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given + maximum seq_length for better efficiency. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama32_1b + + Python API usage: + >>> recipe = finetune_recipe(name="llama32_1b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + + # For unpacked sequence, most samples in SQuAD dataset are shorter than 2K + if seq_length is None: + seq_length = 4096 if packed_sequence else 2048 + + recipe = default_finetune_recipe( + model(), "meta-llama/Llama-3.2-1B", dir, name, num_nodes, num_gpus_per_node, packed_sequence + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) + recipe.peft.dim = 8 + recipe.peft.alpha = 16 + recipe.optim.config.use_distributed_optimizer = False + + # some settings currently do not function correctly with LoRA + recipe.model.config.cross_entropy_loss_fusion = False + + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + + # Sequence length settings in the model and dataset must agree + recipe.model.config.seq_length = seq_length + recipe.data.seq_length = seq_length + if packed_sequence: + recipe.data.dataset_kwargs = {'pad_to_max_length': True} + recipe.data.packed_sequence_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length) + + return recipe diff --git a/nemo/collections/llm/recipes/llama32_3b.py b/nemo/collections/llm/recipes/llama32_3b.py new file mode 100644 index 000000000000..d78ea0b50983 --- /dev/null +++ b/nemo/collections/llm/recipes/llama32_3b.py @@ -0,0 +1,270 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Callable, Optional + +import lightning.pytorch as pl +import nemo_run as run +import torch +from lightning.pytorch.callbacks.callback import Callback +from megatron.core.distributed import DistributedDataParallelConfig + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs +from nemo.collections.llm.gpt.model.llama import Llama32Config3B, LlamaModel +from nemo.collections.llm.peft import PEFT_STR2CLS +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.utils.exp_manager import TimingCallback + +NAME = "llama32_3b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3.2 3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3.2 3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama32_3b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + conf = run.Config(Llama32Config3B) + conf.seq_length = 8192 + return run.Config(LlamaModel, config=conf) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3.2 3B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama32_3b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3.2 3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama32_3b + $ nemo llm pretrain --factory "llama32_3b(num_nodes=1, name='my_3b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama32_3b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + This recipe is optimized for the large 8B model and requires significant computational resources. + """ + recipe = run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', + seq_length: Optional[int] = None, + packed_sequence: Optional[bool] = None, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3.2 3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. 
+ peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + seq_length (int): Maximum number of tokens per microbatch. + packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given + maximum seq_length for better efficiency. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama32_3b + + Python API usage: + >>> recipe = finetune_recipe(name="llama32_3b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + + # For unpacked sequence, most samples in SQuAD dataset are shorter than 2K + if seq_length is None: + seq_length = 4096 if packed_sequence else 2048 + + recipe = default_finetune_recipe( + model(), "meta-llama/Llama-3.2-3B", dir, name, num_nodes, num_gpus_per_node, packed_sequence + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) + recipe.peft.dim = 8 + recipe.peft.alpha = 16 + recipe.optim.config.use_distributed_optimizer = False + + # some settings currently do not function correctly with LoRA + recipe.model.config.cross_entropy_loss_fusion = False + + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + + # Sequence length settings in the model and dataset must agree + recipe.model.config.seq_length = seq_length + recipe.data.seq_length = seq_length + if packed_sequence: + recipe.data.dataset_kwargs = {'pad_to_max_length': True} + recipe.data.packed_sequence_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length) + + return recipe From ba7a68255bb2be0d449f7c63ed43178f78e188fd Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 22 Nov 2024 21:22:25 +0200 Subject: [PATCH 03/11] mlm conversion & tiktokenizer support (#11349) * mlm conversion fix Signed-off-by: dimapihtar * add tiktoken support for nemotron -> hf Signed-off-by: dimapihtar * additional params Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * add ci test for mlm conversion Signed-off-by: dimapihtar * add ci test for mlm ckpt conversion Signed-off-by: dimapihtar * remove extra if statement Signed-off-by: dimapihtar * fix typo Signed-off-by: dimapihtar * fix if statement Signed-off-by: dimapihtar * fix paths Signed-off-by: dimapihtar * update paths Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: dimapihtar --- .github/workflows/cicd-main.yml | 121 ++++++++++++++++++ .../megatron_ckpt_to_nemo.py | 9 +- nemo/collections/nlp/models/nlp_model.py | 17 ++- .../convert_nemotron_nemo_to_hf.py | 38 +++++- 4 files changed, 182 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 49c6c55ca778..b82bbc65cfc1 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2109,6 +2109,126 @@ jobs: # } # } + L2_Megatron_LM_To_NeMo_Conversion: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_LM_To_NeMo_Conversion') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=1 Megatron-LM/pretrain_gpt.py \ + --mock-data \ + --distributed-timeout-minutes 60 \ + --use-mcore-models \ + --no-mmap-bin-files \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --train-samples 80 \ + --init-method-std 0.014 \ + --position-embedding-type rope \ + --rotary-base 1000000 \ + --rotary-percent 1.0 \ + --squared-relu \ + --num-layers 4 \ + --hidden-size 384 \ + --num-attention-heads 8 \ + --group-query-attention \ + --num-query-groups 8 \ + --ffn-hidden-size 1536 \ + --kv-channels 128 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --exit-duration-in-mins 5750 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --micro-batch-size 1 \ + --global-batch-size 8 \ + --lr 6e-4 \ + --min-lr 6e-6 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --lr-decay-style cosine \ + --log-interval 1 \ + --eval-iters 1 \ + --eval-interval 10 \ + --tokenizer-type GPT2BPETokenizer \ + --tokenizer-model /home/TestData/nlp/gpt2_tokenizer \ + --vocab-file /home/TestData/nlp/gpt2_tokenizer/vocab.json \ + --merge-file /home/TestData/nlp/gpt2_tokenizer/merges.txt \ + --save /tmp/mlm_conversion_ckpt \ + --save-interval 10 \ + --ckpt-format torch_dist \ + --ckpt-fully-parallel-save \ + --ckpt-fully-parallel-load \ + --async-save \ + --ckpt-assume-constant-structure \ + --timing-log-option minmax \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-throughput \ + --bf16 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + --overlap-param-gather \ + --manual-gc \ + --num-workers 2 + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + model.data.data_impl=mock \ + model.data.data_prefix=[] \ + model.skip_train=True \ + model.transformer_engine=True \ + model.use_flash_attention=False \ + model.normalization=rmsnorm \ + model.num_layers=4 \ + model.hidden_size=384 \ + model.ffn_hidden_size=1536 \ + model.num_attention_heads=8 \ + model.num_query_groups=8 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=True \ + model.masked_softmax_fusion=True \ + model.encoder_seq_length=8192 \ + model.max_position_embeddings=8192 \ + model.data.seq_length=8192 \ + model.activation=squared-relu \ + model.transformer_block_type=True \ + model.micro_batch_size=1 \ + model.global_batch_size=8 \ + ++model.rotary_base=1000000 \ + model.rotary_percentage=1.0 \ + model.apply_query_key_layer_scaling=False \ + ++model.group_query_attention=True \ + model.apply_rope_fusion=True \ + model.kv_channels=128 \ + ++model.bert_binary_head=True \ + ++model.position_embedding_type=rope \ + ++model.add_position_embedding=True \ + trainer.limit_val_batches=1 \ + exp_manager.exp_dir=/tmp/nemo_conversion_ckpt + + python -m torch.distributed.launch --nproc_per_node=1 examples/nlp/language_modeling/megatron_ckpt_to_nemo.py \ + --checkpoint_folder /tmp/mlm_conversion_ckpt \ + --checkpoint_name iter_0000010 \ + --nemo_file_path /tmp/mlm_to_nemo_test.nemo \ + --tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 1 \ + --gpus_per_node 1 \ + --model_type gpt \ + --hparams_file 
/tmp/nemo_conversion_ckpt/megatron_gpt/version_0/hparams.yaml \ + --convert_mlm + + AFTER_SCRIPT: | + rm -rf /tmp/nemo_conversion_ckpt + rm -rf /tmp/mlm_conversion_ckpt + rm -rf /tmp/mlm_to_nemo_test.nemo + L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4432,6 +4552,7 @@ jobs: - L2_RAG_Pipeline_Generating - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_Skip_Train + - L2_Megatron_LM_To_NeMo_Conversion - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2 diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index b46f8f459ff0..4b9fab987dc7 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -112,6 +112,11 @@ def get_args(): choices=['32-true', '16-mixed', 'bf16-mixed'], help="Precision value for the trainer that matches with precision of the ckpt", ) + parser.add_argument( + "--convert_mlm", + action="store_true", + help="Use this flag to convert megatron-lm checkpoints.", + ) args = parser.parse_args() return args @@ -195,7 +200,9 @@ def convert(local_rank, rank, world_size, args): ) if args.model_type == 'gpt': - model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer) + model = MegatronGPTModel.load_from_checkpoint( + checkpoint_path, hparams_file=args.hparams_file, trainer=trainer, load_mlm=args.convert_mlm + ) elif args.model_type == 'sft': model = MegatronGPTSFTModel.load_from_checkpoint( checkpoint_path, hparams_file=args.hparams_file, trainer=trainer diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py index 0c61b085bc7f..6a87eb28723c 100644 --- a/nemo/collections/nlp/models/nlp_model.py +++ b/nemo/collections/nlp/models/nlp_model.py @@ -397,7 +397,22 @@ def dummy(): model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) model.trainer.strategy.setup_environment() sharded_state_dict = model.sharded_state_dict() - checkpoint['state_dict'] = sharded_state_dict + if kwargs.get("load_mlm", False): + mlm_sharded_state_dict = {} + for k, v in sharded_state_dict.items(): + # Remove 'model.' 
from the sharded_state_dict keys + new_key = k.replace('model.', '', 1) + + # Update the key attribute of the ShardedTensor value + new_value = v + if hasattr(v, 'key'): + new_value.key = v.key.replace('model.', '', 1) + + # Add the updated key-value pair to the new dictionary + mlm_sharded_state_dict[new_key] = new_value + checkpoint['state_dict'] = mlm_sharded_state_dict + else: + checkpoint['state_dict'] = sharded_state_dict # load the checkpoint from disk checkpoint = dist_checkpointing.load(sharded_state_dict=checkpoint, checkpoint_dir=checkpoint_dir) # restore the weights diff --git a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py index 392e3628ccdb..2f66773f8724 100644 --- a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py @@ -21,7 +21,7 @@ import torch from lightning.pytorch import Trainer from transformers import LlamaTokenizer, PreTrainedTokenizerFast -from transformers.convert_slow_tokenizer import LlamaConverter +from transformers.convert_slow_tokenizer import LlamaConverter, TikTokenConverter from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -130,6 +130,20 @@ def convert_hf_config(nemo_config, tokenizer, vocab_size, dtype, hf_output_path, json.dump(hf_config, open(f"{hf_output_path}/config.json", "w"), indent=2) +def convert_tiktoken(vocab_file) -> None: + with open(vocab_file, 'r') as f: + vocab = json.load(f) + os.remove(vocab_file) + + lines = [] + for line in vocab: + lines.append(f"{line['token_bytes']} {line['rank']}") + + for line in lines: + with open(vocab_file, 'a') as f: + f.write(line + '\n') + + def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None: """ Convert NeMo weights to HF weights @@ -323,6 +337,28 @@ def extract_nemotron_tokenizer(nemo_file, model_config, output_hf_path, nemo_tok ) tokenizer.save_pretrained(output_hf_path) logging.info(f"Setencepiece tokenizer has been saved to {output_tokenizer}") + elif tokenizer_cfg.library == "tiktoken": + tokenizer_fn = tokenizer_cfg.model[5:] + special_tokens = ["", "", ""] + import tarfile + + archive = tarfile.open(nemo_file, "r") + tokenizer_filename = "./" + tokenizer_fn # exclude 'nemo:' prefix + archive.extract(tokenizer_filename, output_hf_path) + archive.close() + vocab_file = os.path.join(output_hf_path, tokenizer_fn) + convert_tiktoken(vocab_file) + converted_tokenizer = TikTokenConverter( + vocab_file=vocab_file, additional_special_tokens=special_tokens + ).converted() + os.remove(vocab_file) + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=converted_tokenizer, + model_input_names=["input_ids", "attention_mask"], + bos_token="", + eos_token="", + ) + tokenizer.save_pretrained(output_hf_path) elif isinstance(nemo_tokenizer, AutoTokenizer): nemo_tokenizer.tokenizer.save_pretrained(output_hf_path) logging.info(f"HF AutoTokenizer has been saved to {output_hf_path}") From 7ec58fab6fe990efb6abf18b68bc2eceffbbd457 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 22 Nov 2024 11:25:39 -0800 Subject: [PATCH 04/11] nit: remove non-strictly needed lines --- .github/workflows/cicd-main.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b82bbc65cfc1..a4b2baa59550 100644 --- a/.github/workflows/cicd-main.yml +++ 
b/.github/workflows/cicd-main.yml @@ -2224,11 +2224,6 @@ jobs: --hparams_file /tmp/nemo_conversion_ckpt/megatron_gpt/version_0/hparams.yaml \ --convert_mlm - AFTER_SCRIPT: | - rm -rf /tmp/nemo_conversion_ckpt - rm -rf /tmp/mlm_conversion_ckpt - rm -rf /tmp/mlm_to_nemo_test.nemo - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml From 9d80f84bc101282046707d55ed2b1ef490f31a80 Mon Sep 17 00:00:00 2001 From: Huiying Date: Fri, 22 Nov 2024 15:12:47 -0800 Subject: [PATCH 05/11] add metric calc (#11381) Signed-off-by: HuiyingLi --- .../llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 25 +++++++++++++++++++ .../llama-3/nemo2-sft-peft/nemo2-sft.ipynb | 25 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index cd3bae1cc627..aa463e2b84be 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -499,6 +499,31 @@ "```" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5. Calculate Evaluation Metrics\n", + "\n", + "We can evaluate the model's predictions by calculating the Exact Match (EM) and F1 scores.\n", + "- Exact Match is a binary measure (0 or 1) checking if the model outputs match one of the\n", + "ground truth answer exactly.\n", + "- F1 score is the harmonic mean of precision and recall for the answer words.\n", + "\n", + "Below is a script that computes these metrics. The sample scores can be improved by training the model further and performing hyperparameter tuning. In this notebook, we only train for 20 steps.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!python /opt/NeMo/scripts/metric_calculation/peft_metric_calc.py --pred_file peft_prediction.jsonl --label_field \"original_answers\" --pred_field \"prediction\"" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb index 479d81928e98..e84ff916fc4e 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb @@ -606,6 +606,31 @@ "{\"input\": \"Muckle Water is a long, narrow fresh water loch on Ward Hill on Rousay, Orkney, Scotland. It is the biggest loch on the island and is popular for fishing. It can be reached by a track from the roadside. The Suso Burn on the north eastern shore drains the loch into the Sound of Rousay.\\n\\nWhere is Muckle Water?\", \"category\": \"closed_qa\", \"label\": \"Muckle water is located in Rousay, Orkney, Scotland.\", \"prediction\": \" Muckle Water is a long, narrow fresh water loch on Ward Hill on Rousay,\"}\n", "```" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5. Calculate Evaluation Metrics\n", + "\n", + "We can evaluate the model's predictions by calculating the Exact Match (EM) and F1 scores.\n", + "- Exact Match is a binary measure (0 or 1) checking if the model outputs match one of the\n", + "ground truth answer exactly.\n", + "- F1 score is the harmonic mean of precision and recall for the answer words.\n", + "\n", + "Below is a script that computes these metrics. The sample scores can be improved by training the model further and performing hyperparameter tuning. 
In this notebook, we only train for 20 steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!python /opt/NeMo/scripts/metric_calculation/peft_metric_calc.py --pred_file sft_prediction.jsonl --label_field \"label\" --pred_field \"prediction\"" + ] } ], "metadata": { From e83d3eaa968120cd22112215510af0c7fd7ccc90 Mon Sep 17 00:00:00 2001 From: Michal Futrega Date: Sat, 23 Nov 2024 22:08:00 +0100 Subject: [PATCH 06/11] Enable packed dataset for validation; add a2a_experimental argument (#11378) * Enable packed dataset for validation; add a2a_experimental argument * Apply isort and black reformatting Signed-off-by: michal2409 --------- Signed-off-by: michal2409 Co-authored-by: michal2409 --- nemo/collections/llm/gpt/data/fine_tuning.py | 49 ++++++++++++++----- .../llm/gpt/data/packed_sequence.py | 30 +++++++++--- nemo/collections/llm/peft/lora.py | 3 ++ 3 files changed, 63 insertions(+), 19 deletions(-) diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 8fcef72f3bd9..0d866bb600fe 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -117,17 +117,28 @@ def prepare_data(self) -> None: """ Prepare packed sequence data """ - if self.packed_sequence_size > 0 and not self.train_path_packed.is_file(): + if self.packed_sequence_size > 0: from nemo.collections.llm.gpt.data.packed_sequence import prepare_packed_sequence_data - prepare_packed_sequence_data( - input_path=self.train_path, - output_path=self.train_path_packed, - packed_sequence_size=self.packed_sequence_size, - tokenizer=self.tokenizer, - max_seq_length=self.seq_length, - seed=self.seed, - ) + if not self.train_path_packed.is_file(): + prepare_packed_sequence_data( + input_path=self.train_path, + output_path=self.train_path_packed, + packed_sequence_size=self.packed_sequence_size, + tokenizer=self.tokenizer, + max_seq_length=self.seq_length, + seed=self.seed, + ) + + if not self.validation_path_packed.is_file(): + prepare_packed_sequence_data( + input_path=self.validation_path, + output_path=self.validation_path_packed, + packed_sequence_size=self.packed_sequence_size, + tokenizer=self.tokenizer, + max_seq_length=self.seq_length, + seed=self.seed, + ) def setup(self, stage: str): """Called by pytorch lightning in datamodule setup""" @@ -195,7 +206,7 @@ def val_dataloader(self) -> DataLoader: # pylint: disable=C0115,C0116 return self._create_dataloader( self._create_dataset( - self.validation_path, + self.validation_path if self.packed_sequence_size <= 0 else self.validation_path_packed, is_test=True, **self.dataset_kwargs, ), @@ -249,8 +260,8 @@ def train_path_packed(self) -> Path: """Path to training dataset file for packed sequence. 
The file path contains a reference to the tokenizer/model name since packed sequence dataset consists of tokenized indices.""" if self.packed_sequence_size > 0: - if self.packed_sequence_specs.packed_data_path is not None: - return self.packed_sequence_specs.packed_data_path + if self.packed_sequence_specs.packed_train_data_path is not None: + return self.packed_sequence_specs.packed_train_data_path tokenizer_model_name = self._extract_tokenizer_model_name() folder_name = self.dataset_root / "packed" / tokenizer_model_name folder_name.mkdir(parents=True, exist_ok=True) @@ -258,6 +269,20 @@ def train_path_packed(self) -> Path: else: raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.") + @property + def validation_path_packed(self) -> Path: + """Path to validation dataset file for packed sequence. The file path contains a reference to the + tokenizer/model name since packed sequence dataset consists of tokenized indices.""" + if self.packed_sequence_size > 0: + if self.packed_sequence_specs.packed_val_data_path is not None: + return self.packed_sequence_specs.packed_val_data_path + tokenizer_model_name = self._extract_tokenizer_model_name() + folder_name = self.dataset_root / "packed" / tokenizer_model_name + folder_name.mkdir(parents=True, exist_ok=True) + return folder_name / f"validation_{self.packed_sequence_size}.npy" + else: + raise ValueError("`validation_path_packed` invalid since packed sequence size is not specified.") + @property def validation_path(self) -> Path: """Path to validation dataset file""" diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py index 153e79f94391..345489ea0b63 100644 --- a/nemo/collections/llm/gpt/data/packed_sequence.py +++ b/nemo/collections/llm/gpt/data/packed_sequence.py @@ -101,15 +101,31 @@ class PackedSequenceSpecs: This field is set by llm.finetune api. """ - packed_data_path: str = None + packed_train_data_path: str = None """ - If specified, use the packed dataset from this file instead of the default path. + If specified, use this file for the packed training dataset instead of the default path. + """ + + packed_val_data_path: str = None + """ + If specified, use this file for the packed validation dataset instead of the default path. 
""" def __post_init__(self): - if self.packed_data_path is not None: - self.packed_data_path = Path(self.packed_data_path) + if self.packed_train_data_path is not None: + self.packed_train_data_path = Path(self.packed_train_data_path) + assert ( + self.packed_train_data_path.suffix == ".npy" + ), f"packed training data file must be a .npy file: {self.packed_train_data_path}" + assert ( + self.packed_train_data_path.exists() + ), f"packed training data file does not exist: {self.packed_train_data_path}" + + if self.packed_val_data_path is not None: + self.packed_val_data_path = Path(self.packed_val_data_path) + assert ( + self.packed_val_data_path.suffix == ".npy" + ), f"packed validation data file must be a .npy file: {self.packed_val_data_path}" assert ( - self.packed_data_path.suffix == ".npy" - ), f"packed data file must be a .npy file: {self.packed_data_path}" - assert self.packed_data_path.exists(), f"packed data file does not exist: {self.packed_data_path}" + self.packed_val_data_path.exists() + ), f"packed validation data file does not exist: {self.packed_val_data_path}" diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 57cdda3a2871..205cde071fa7 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -124,6 +124,7 @@ class LoRA(PEFT): dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0. dropout_position (Literal['pre', 'post'], optional): Position for applying dropout. Can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'post'. + a2a_experimental (bool): Enables the experimental All-to-All (A2A) communication strategy. Defaults to False. Example: -------- @@ -151,6 +152,7 @@ class LoRA(PEFT): dropout_position: Literal['pre', 'post'] = 'post' lora_A_init_method: str = "xavier" lora_B_init_method: str = "zero" + a2a_experimental: bool = False def transform(self, m: nn.Module, name=None, prefix=None): """ @@ -224,6 +226,7 @@ def wildcard_match(pattern, key): model_parallel_config=getattr(m, "config", None), alpha=self.alpha, is_expert=is_expert_linear(full_name), + a2a_experimental=self.a2a_experimental, ) return AdapterParallelAdd(m, adapter) return m From 3afcde032355efa735f670db72f7efa361ab26dc Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:27:47 -0500 Subject: [PATCH 07/11] Fix DDP unused param error when TE is enabled in NeMo Lite (#11364) * Fix DDP unused param error when TE is enabled Signed-off-by: Onur Yilmaz * Added partial function for te Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia --- examples/llm/sft/hf.py | 23 ++++++++++--------- .../gpt/model/hf_auto_model_for_causal_lm.py | 6 +++++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py index 59b8b4ad3491..1d282312b130 100755 --- a/examples/llm/sft/hf.py +++ b/examples/llm/sft/hf.py @@ -19,7 +19,7 @@ from nemo import lightning as nl from nemo.collections import llm -from nemo.lightning.pytorch.accelerate.transformer_engine import is_te_accelerated, te_accelerate +from nemo.lightning.pytorch.accelerate.transformer_engine import is_te_accelerated from nemo.lightning.pytorch.callbacks import ModelCallback @@ -75,16 +75,17 @@ def squad(tokenizer) -> pl.LightningDataModule: grad_clip = None use_dist_samp = False - model = 
llm.HfAutoModelForCausalLM(args.model) - tokenizer = model.tokenizer + model_accelerator = None + if args.model_accelerator == "te": + from functools import partial + from nemo.lightning.pytorch.accelerate.transformer_engine import te_accelerate - callbacks = [] - if args.model_accelerator: - if args.model_accelerator == "te": - model_transform = ModelCallback( - on_train_start=lambda model: te_accelerate(model, fp8_autocast=args.fp8_autocast) - ) - callbacks.append(model_transform) + model_accelerator = partial(te_accelerate, fp8_autocast=args.fp8_autocast) + + from nemo.lightning.pytorch.accelerate.transformer_engine import te_accelerate + + model = llm.HfAutoModelForCausalLM(model_name=args.model, model_accelerator=model_accelerator) + tokenizer = model.tokenizer llm.api.finetune( model=model, @@ -100,7 +101,7 @@ def squad(tokenizer) -> pl.LightningDataModule: accumulate_grad_batches=10, gradient_clip_val=grad_clip, use_distributed_sampler=use_dist_samp, - callbacks=callbacks, + callbacks=[], logger=wandb, ), optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)), diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index c0f02d706ceb..26e4604adc43 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -39,6 +39,7 @@ def __init__( tokenizer=None, loss_fn=masked_cross_entropy, model_transform=None, + model_accelerator=None, trust_remote_code=False, ): super().__init__() @@ -50,6 +51,7 @@ def __init__( self.load_pretrained_weights = load_pretrained_weights self.is_hf_model = True self.model_transform = model_transform + self.model_accelerator = model_accelerator self.trust_remote_code = trust_remote_code @property @@ -78,6 +80,10 @@ def configure_model(self): config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code) self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=self.trust_remote_code) + + if self.model_accelerator is not None: + self.model_accelerator(self.model) + self.model.train() def forward(self, input_ids, attention_mask=None, labels=None, loss_mask=None): From 5094b2e53836adf0a50d455c70513c774cf6523a Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:42:57 -0800 Subject: [PATCH 08/11] Update llama32 vision (mllama) use attention bias (#11316) * update recipe Signed-off-by: yaoyu-33 * fix mllama mock ds Signed-off-by: yaoyu-33 * update to use attention bias Signed-off-by: yaoyu-33 * remove example Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring mock.py Signed-off-by: yaoyu-33 * fix docstring language.py Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring language.py Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring mllama/base.py Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring mllama/language.py Signed-off-by: yaoyu-33 * bump mcore Signed-off-by: Oliver Koenig * Add scripts for mllama Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update script Signed-off-by: yaoyu-33 * fix pylint Signed-off-by: yaoyu-33 * revert Dockerfile.ci Signed-off-by: Yu Yao 
<54727607+yaoyu-33@users.noreply.github.com> * update script match recipe Signed-off-by: yaoyu-33 * update recipes Signed-off-by: yaoyu-33 * update mllama 90b recipe Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: Oliver Koenig Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: yaoyu-33 Co-authored-by: Oliver Koenig --- nemo/collections/vlm/mllama/data/mock.py | 40 ++++ nemo/collections/vlm/mllama/model/base.py | 61 +++-- nemo/collections/vlm/mllama/model/language.py | 80 ++++++- nemo/collections/vlm/mllama/model/vision.py | 13 +- nemo/collections/vlm/recipes/mllama_11b.py | 33 +-- nemo/collections/vlm/recipes/mllama_90b.py | 25 ++- scripts/vlm/mllama_finetune.py | 212 ++++++++++++++++++ scripts/vlm/mllama_generation.py | 164 ++++++++++++++ 8 files changed, 573 insertions(+), 55 deletions(-) create mode 100644 scripts/vlm/mllama_finetune.py create mode 100644 scripts/vlm/mllama_generation.py diff --git a/nemo/collections/vlm/mllama/data/mock.py b/nemo/collections/vlm/mllama/data/mock.py index fae92b097200..4d078c745492 100644 --- a/nemo/collections/vlm/mllama/data/mock.py +++ b/nemo/collections/vlm/mllama/data/mock.py @@ -25,6 +25,26 @@ class MockDataModule(pl.LightningDataModule): + """ + Mock DataModule for testing and development. + Generates synthetic data for training, validation, and testing purposes. + + Args: + seq_length (int): Sequence length for the generated data. + decoder_seq_length (Optional[int]): Decoder sequence length if applicable, used in pp. + vocab_size (int): Size of the vocabulary of tokenizer. + crop_size (Tuple[int, int]): Image crop size (height, width). + micro_batch_size (int): Micro batch size for data loading. + global_batch_size (int): Global batch size across all processes. + rampup_batch_size (Optional[List[int]]): Batch size ramp-up configuration. + num_train_samples (int): Number of training samples to generate. + num_val_samples (int): Number of validation samples to generate. + num_test_samples (int): Number of test samples to generate. + num_workers (int): Number of workers for data loading. + pin_memory (bool): Whether to pin memory for data loading. + persistent_workers (bool): Whether workers should remain persistent. 
+ """ + def __init__( self, seq_length: int = 2048, @@ -66,6 +86,7 @@ def __init__( ) def setup(self, stage: str = "") -> None: + """Set up datasets for the specified stage.""" self._train_ds = _MockMLlamaDataset( self.vocab_size, self.crop_size, "train", self.num_train_samples, self.decoder_seq_length ) @@ -77,21 +98,25 @@ def setup(self, stage: str = "") -> None: ) def train_dataloader(self) -> TRAIN_DATALOADERS: + """Returns the DataLoader for training.""" if not hasattr(self, "_train_ds"): self.setup() return self._create_dataloader(self._train_ds) def val_dataloader(self) -> EVAL_DATALOADERS: + """Returns the DataLoader for validation.""" if not hasattr(self, "_validation_ds"): self.setup() return self._create_dataloader(self._validation_ds) def test_dataloader(self) -> EVAL_DATALOADERS: + """Returns the DataLoader for testing.""" if not hasattr(self, "_test_ds"): self.setup() return self._create_dataloader(self._test_ds) def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + """Creates a DataLoader for the specified dataset.""" return DataLoader( dataset, num_workers=self.num_workers, @@ -103,6 +128,18 @@ def _create_dataloader(self, dataset, **kwargs) -> DataLoader: class _MockMLlamaDataset(Dataset): + """ + Mock dataset for generating synthetic data with text and image components. + + Args: + vocab_size (int): Vocabulary size for text data. + crop_size (Tuple[int, int]): Image crop size (height, width). + name (str): Name of the dataset split ('train', 'valid', 'test'). + num_samples (int): Number of samples in the dataset. + seq_length (int): Sequence length for the text data. + seed (int): Seed for random number generation. + """ + def __init__( self, vocab_size, @@ -127,13 +164,16 @@ def __init__( self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) def __len__(self) -> int: + """Returns the number of samples in the dataset.""" return self.length def _get_text(self, idx: int) -> np.ndarray: + """Generates a random sequence of integers representing text tokens.""" np_gen = np.random.default_rng(seed=(self.seed + idx)) return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64) def __getitem__(self, idx) -> Dict[str, torch.Tensor]: + """Generates a single data sample.""" # Generate data of the expected size and datatype (based on GPTDataset). 
np_gen = np.random.default_rng(seed=(self.seed + idx)) tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64)) diff --git a/nemo/collections/vlm/mllama/model/base.py b/nemo/collections/vlm/mllama/model/base.py index d417af27aedd..9279936e23d7 100644 --- a/nemo/collections/vlm/mllama/model/base.py +++ b/nemo/collections/vlm/mllama/model/base.py @@ -47,7 +47,8 @@ from nemo.utils import logging -def llama_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: +def mllama_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: + """Mllama data step.""" from megatron.core import parallel_state # Based on: https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py#L87 @@ -96,7 +97,8 @@ def llama_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: return output -def llama_forward_step(model, batch) -> torch.Tensor: +def mllama_forward_step(model, batch) -> torch.Tensor: + """Mllama model forward step.""" forward_config = { "batch_images": batch["batch_images"], "batch_masks": batch["batch_masks"], @@ -114,13 +116,15 @@ def llama_forward_step(model, batch) -> torch.Tensor: def set_input_tensor(self, tensor): + """Placeholder for `set_input_tensor` method for PP implementation.""" pass @dataclass class CrossAttentionVisionConfig(TransformerConfig, io.IOMixin): - # core params + """Configuration for llama vision model.""" + # core params bias_activation_fusion: bool = True bias_dropout_add_fusion: bool = True @@ -150,9 +154,11 @@ class CrossAttentionVisionConfig(TransformerConfig, io.IOMixin): @property def max_aspect_ratio_id(self) -> int: + # pylint: disable=C0115,C0116 return len(self.supported_aspect_ratios) def configure_model(self) -> "CrossAttentionVisionModel": + """Configure mllama vision model.""" return CrossAttentionVisionModel( self, ) @@ -160,6 +166,10 @@ def configure_model(self) -> "CrossAttentionVisionModel": @dataclass class CrossAttentionTextConfig(Llama31Config): + """ + Configuration for llama model with cross-attention layers to take in multimodal features. 
+ """ + rotary_base: int = 500_000 seq_length: int = 8192 num_layers: int = 32 @@ -171,12 +181,14 @@ class CrossAttentionTextConfig(Llama31Config): apply_rope_fusion: bool = False def _init_fusion_schedule(self, num_layers: int) -> List[int]: - llama_layers = list(range(self.num_layers)) + """Initialize self-attention layer / cross-attention layer fusion schedule""" + mllama_layers = list(range(self.num_layers)) # uniformly spread the layers - k = math.ceil(len(llama_layers) / num_layers) - return llama_layers[::-1][::k][:num_layers][::-1] + k = math.ceil(len(mllama_layers) / num_layers) + return mllama_layers[::-1][::k][:num_layers][::-1] def configure_model(self, tokenizer, pre_process=True, post_process=True): + """Configure mllama text model.""" self.fusion_schedule = self._init_fusion_schedule(self.num_cross_attention_layers) vp_size = self.virtual_pipeline_model_parallel_size if vp_size: @@ -225,6 +237,8 @@ def configure_model(self, tokenizer, pre_process=True, post_process=True): @dataclass class MLlamaModelConfig(TransformerConfig, io.IOMixin): + """Combined configuration for multimodal vision-language model.""" + language_model_config: Optional[CrossAttentionTextConfig] = None vision_model_config: Optional[CrossAttentionVisionConfig] = None @@ -237,8 +251,8 @@ class MLlamaModelConfig(TransformerConfig, io.IOMixin): language_model_from_pretrained: Optional[str] = None # TODO vision_model_from_pretrained: Optional[str] = None # TODO - forward_step_fn: Callable = llama_forward_step - data_step_fn: Callable = llama_data_step + forward_step_fn: Callable = mllama_forward_step + data_step_fn: Callable = mllama_data_step def __post_init__(self): if self.language_model_config is not None: @@ -246,6 +260,7 @@ def __post_init__(self): setattr(self, attr, getattr(self.language_model_config, attr)) def configure_model(self, tokenizer) -> "MLlamaBaseModel": + """Configure mllama model.""" from megatron.core import parallel_state as ps self.language_model_config.tensor_model_parallel_size = self.tensor_model_parallel_size @@ -274,6 +289,8 @@ def configure_model(self, tokenizer) -> "MLlamaBaseModel": class CrossAttentionVisionModel(MegatronModule): + """Mllama vision model.""" + def __init__(self, config) -> None: super().__init__(config=config) return_intermediate = "3,7,15,23,30" @@ -303,6 +320,7 @@ def __init__(self, config) -> None: self.vision_projection.encoder.skip_bias_add = False # Temporary fix for a MCore side bug def forward(self, images: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch.Tensor: + """Forward.""" # vision_tokens: (B, T, D) # aspect_ratio_ids: (B, 1) # h: (B, T, D) @@ -313,10 +331,13 @@ def forward(self, images: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch return vision_tokens def set_input_tensor(self, tensor): + # pylint: disable=C0115,C0116 pass class MLlamaBaseModel(MegatronModule): + """Mllama base model combining vision and text models with cross-attention.""" + def __init__( self, config: MLlamaModelConfig, @@ -356,10 +377,6 @@ def __init__( self.patch_size = 14 self.image_res = vision_model_config.vision_chunk_size self.max_num_chunks = vision_model_config.vision_max_num_chunks - logging.warning("[WARNING] NeMo Mllama will always pad images to max number of tiles. 
A fix is coming soon!") - - def setup_cache(self, max_batch_size: int, dtype: torch.dtype): - self.language_model.setup_cache(max_batch_size, dtype) def compute_xattn_caches_masks( self, @@ -369,6 +386,7 @@ def compute_xattn_caches_masks( num_chunks: torch.Tensor, total_len: int, ) -> Tuple[List, torch.Tensor, torch.Tensor]: + """Compute xattn caches masks used in text model.""" bsz, nimg, nchunk, ntok, image_token_dim = vision_orig_shape xattn_caches = [ @@ -408,6 +426,7 @@ def forward( full_text_row_masked_out_mask: Optional[torch.Tensor] = None, xattn_caches: Optional[List] = None, ) -> torch.Tensor: + """Forward.""" if xattn_caches is None: bsz, max_num_images = batch_images.size(0), batch_images.size(1) vision_orig_shape = ( @@ -418,8 +437,8 @@ def forward( self.config.hidden_size, ) skip_vision_encoder = False - num_chunks[num_chunks > 0] = self.max_num_chunks if max_num_images == 0: + num_chunks[num_chunks > 0] = self.max_num_chunks skip_vision_encoder = True if self.encoder_hidden_state is not None: @@ -489,6 +508,8 @@ def set_input_tensor(self, input_tensor) -> None: class MLlamaModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): + """Lightning Module for the MLlama model.""" + def __init__( self, config: MLlamaModelConfig, @@ -506,6 +527,7 @@ def __init__( self._validation_loss_reduction = None def configure_model(self) -> None: + """Configure mllama model""" if not hasattr(self, "module"): self.module: MLlamaBaseModel = self.config.configure_model(self.tokenizer) @@ -522,7 +544,7 @@ def forward( full_text_row_masked_out_mask: Optional[torch.Tensor] = None, xattn_caches: Optional[torch.Tensor] = None, ) -> torch.Tensor: - + """Forward.""" output_tensor = self.module( position_ids=position_ids, tokens=tokens, @@ -539,22 +561,26 @@ def forward( return output_tensor def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]: + # pylint: disable=C0115,C0116 return self.config.data_step_fn(dataloader_iter) def forward_step(self, batch) -> torch.Tensor: + # pylint: disable=C0115,C0116 return self.config.forward_step_fn(self, batch) def training_step(self, batch, batch_idx=None) -> torch.Tensor: + # pylint: disable=C0115,C0116 # In mcore the loss-function is part of the forward-pass (when labels are provided) return self.forward_step(batch) def validation_step(self, batch, batch_idx=None) -> torch.Tensor: + # pylint: disable=C0115,C0116 # In mcore the loss-function is part of the forward-pass (when labels are provided) - return self.forward_step(batch) @property def training_loss_reduction(self) -> MaskedTokenLossReduction: + # pylint: disable=C0115,C0116 if not self._training_loss_reduction: self._training_loss_reduction = MaskedTokenLossReduction() @@ -562,6 +588,7 @@ def training_loss_reduction(self) -> MaskedTokenLossReduction: @property def validation_loss_reduction(self) -> MaskedTokenLossReduction: + # pylint: disable=C0115,C0116 if not self._validation_loss_reduction: self._validation_loss_reduction = MaskedTokenLossReduction(validation_step=True) @@ -573,8 +600,8 @@ def validation_loss_reduction(self) -> MaskedTokenLossReduction: "MLlamaModelConfig", "CrossAttentionTextConfig", "CrossAttentionVisionConfig", - "llama_data_step", - "llama_forward_step", + "mllama_data_step", + "mllama_forward_step", "transformer_engine_layer_spec", "local_layer_spec", ] diff --git a/nemo/collections/vlm/mllama/model/language.py b/nemo/collections/vlm/mllama/model/language.py index b8985e53c54c..5d4cc2e09f21 100644 --- a/nemo/collections/vlm/mllama/model/language.py +++ 
b/nemo/collections/vlm/mllama/model/language.py @@ -60,6 +60,10 @@ @dataclass class MLlamaCrossAttentionSubmodules: + """ + Defines the submodules required for cross-attention layers in the Llama architecture. + """ + linear_q: Union[ModuleSpec, type] = None linear_kv: Union[ModuleSpec, type] = None core_attention: Union[ModuleSpec, type] = None @@ -69,6 +73,10 @@ class MLlamaCrossAttentionSubmodules: class CrossAttentionTextModel(MCoreGPTModel): + """ + GPT-based model with integrated cross-attention layers for multimodal tasks. + """ + def __init__( self, config: TransformerConfig, @@ -122,6 +130,7 @@ def __init__( self._thresh = self.num_frozen_embeddings - 1 def get_partially_trainable_embedding(self, x): + """Get word embedding w/ few extra learnable tokens.""" xz = torch.zeros_like(x, device=x.device) oz = torch.ones_like(x, device=x.device) x_orig = torch.minimum(x, torch.tensor(self._thresh, device=x.device)) @@ -148,7 +157,7 @@ def forward( packed_seq_params: PackedSeqParams = None, extra_block_kwargs: dict = None, ) -> Tensor: - + """Forward.""" # Decoder embedding. if decoder_input is not None: pass @@ -171,6 +180,9 @@ def forward( ) rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + dtype = decoder_input.dtype + cross_attention_bias = cross_attention_masks.to(dtype) * torch.finfo(dtype).min + # Run decoder. hidden_states = self.decoder( hidden_states=decoder_input, @@ -178,9 +190,10 @@ def forward( inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, packed_seq_params=packed_seq_params, - cross_attention_masks=cross_attention_masks, + cross_attention_masks=None, full_text_row_masked_out_mask=full_text_row_masked_out_mask, xattn_caches=xattn_caches, + cross_attention_bias=cross_attention_bias, **(extra_block_kwargs or {}), ) @@ -203,6 +216,10 @@ def forward( class CrossAttentionTransformerBlock(TransformerBlock): + """ + Transformer block with integrated cross-attention layers for multimodal tasks. + """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -220,7 +237,7 @@ def __init__(self, *args, **kwargs): submodules=TransformerLayerSubmodules( cross_attention=ModuleSpec( module=MLlamaCrossAttention, - params={"attn_mask_type": AttnMaskType.arbitrary}, + params={"attn_mask_type": AttnMaskType.no_mask}, submodules=MLlamaCrossAttentionSubmodules( linear_q=TELayerNormColumnParallelLinear, # This wraps attention_norm before attention linear_kv=TEColumnParallelLinear, @@ -250,6 +267,7 @@ def __init__(self, *args, **kwargs): assert len(self.xattn_layers) == len(self.layers), 'Check PP implementation for cross attention layers!' 
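The hunk above swaps the boolean cross-attention mask for an additive bias: the 0/1 mask is cast to the compute dtype and multiplied by its most negative finite value, and the decoder is handed cross_attention_masks=None together with the new cross_attention_bias. A minimal standalone sketch of that conversion (illustration only, not code taken from the patch):

import torch

def mask_to_additive_bias(mask: torch.Tensor, dtype: torch.dtype = torch.bfloat16) -> torch.Tensor:
    # mask holds 0/1 values; 1 marks a position that must not be attended to
    return mask.to(dtype) * torch.finfo(dtype).min

mask = torch.tensor([[0, 1, 0]])                    # middle position is padding
bias = mask_to_additive_bias(mask)
probs = torch.softmax(torch.zeros_like(bias) + bias, dim=-1)
# probs is roughly [0.5, 0.0, 0.5]: the masked position receives no attention weight
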
def _get_layer_offset(self): + """Get correct layer offset when encoder pipeline parallel size > 0.""" encoder_pipeline_model_parallel_size = getattr(self.config, "encoder_pipeline_model_parallel_size", 0) decoder_pipeline_model_parallel_rank = ( parallel_state.get_pipeline_model_parallel_rank() - encoder_pipeline_model_parallel_size @@ -264,9 +282,12 @@ def forward( cross_attention_masks: Tensor = None, full_text_row_masked_out_mask: Tensor = None, rotary_pos_emb: Tensor = None, + attention_bias: Tensor = None, + cross_attention_bias: Tensor = None, inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, ): + """Forward.""" # hidden_states (float): [s, b, h] # attention_mask (bool): [1, 1, s, s] @@ -324,6 +345,7 @@ def forward( xattn_cache=xattn_caches[l_no], full_text_row_masked_out_mask=full_text_row_masked_out_mask, rotary_pos_emb=rotary_pos_emb, + cross_attention_bias=cross_attention_bias, inference_params=inference_params, packed_seq_params=packed_seq_params, ) @@ -331,6 +353,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, inference_params=inference_params, packed_seq_params=packed_seq_params, ) @@ -361,6 +384,7 @@ def forward( def sharded_state_dict( self, prefix: str = '', sharded_offsets: tuple = (), metadata: dict = None ) -> ShardedStateDict: + """Update shareded state dict for cross-attention layers""" sharded_state_dict = {} layer_prefix = f'{prefix}layers.' @@ -399,6 +423,10 @@ def sharded_state_dict( class CrossAttentionTransformerLayer(TransformerLayer): + """ + Transformer layer with cross-attention for integration. + """ + def __init__( self, config: TransformerConfig, @@ -417,6 +445,7 @@ def __init__( self.gate_ffn = nn.Parameter(torch.zeros(1, dtype=self.config.params_dtype)) def compute_xattn_kv_cache(self, xattn_tokens: Tensor) -> Tensor: + """Compute cross-attention kv cahce.""" return self.cross_attention._compute_xattn_kv_cache(xattn_tokens) def forward( @@ -426,9 +455,11 @@ def forward( xattn_cache=None, full_text_row_masked_out_mask=None, rotary_pos_emb=None, + cross_attention_bias=None, inference_params=None, packed_seq_params=None, ): + """Forward.""" # hidden_states: [s, b, h] # Residual connection. @@ -444,6 +475,7 @@ def forward( xattn_cache=xattn_cache, full_text_row_masked_out_mask=full_text_row_masked_out_mask, rotary_pos_emb=rotary_pos_emb, + cross_attention_bias=cross_attention_bias, inference_params=inference_params, ) @@ -507,11 +539,13 @@ def __call__( return hidden_states, None def compute_xattn_kv_cache(self, xattn_tokens: Tensor) -> Optional[Tensor]: + # pylint: disable=C0115,C0116 return None class MLlamaCrossAttention(Attention): - """Cross-attention layer class for Llama VLM support + """ + Cross-attention layer for Llama multimodal tasks. Cross-attention layer takes input with size [s, b, h] and context with size [s, b, h] and returns output of the same size. 
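The forward pass above runs, at every decoder index, the cross-attention layer first and the regular self-attention layer second; indices outside the fusion schedule are filled with dummy pass-through cross-attention modules so the two module lists stay aligned (hence the length assert). A simplified sketch of that interleaving, with toy modules standing in for the real Megatron layers (names and shapes are illustrative assumptions):

import torch
import torch.nn as nn

class PassThroughXAttn(nn.Module):
    # stands in for the dummy cross-attention layer: returns its input unchanged
    def forward(self, hidden_states):
        return hidden_states

class ToyInterleavedDecoder(nn.Module):
    def __init__(self, num_layers, fusion_schedule, hidden=64):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(hidden, hidden) for _ in range(num_layers))
        self.xattn_layers = nn.ModuleList(
            nn.Linear(hidden, hidden) if i in fusion_schedule else PassThroughXAttn()
            for i in range(num_layers)
        )
        assert len(self.xattn_layers) == len(self.layers)

    def forward(self, hidden_states):
        for xattn, self_attn in zip(self.xattn_layers, self.layers):
            hidden_states = xattn(hidden_states)      # cross-attend to vision features (or no-op)
            hidden_states = self_attn(hidden_states)  # regular self-attention decoder layer
        return hidden_states

decoder = ToyInterleavedDecoder(num_layers=8, fusion_schedule=[1, 3, 5, 7])
out = decoder(torch.randn(2, 16, 64))  # [batch, seq, hidden]
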
@@ -574,6 +608,7 @@ def __init__( ) def get_key_value_tensors(self, key_value_states): + """Get key value tensors.""" mixed_kv, _ = self.linear_kv(key_value_states) # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] @@ -590,7 +625,7 @@ def get_key_value_tensors(self, key_value_states): return key, value def get_query_tensor(self, hidden_states): - + """ "Get query tensor.""" # Attention head [sq, b, h] --> [sq, b, hp] query, _ = self.linear_q(hidden_states) @@ -607,6 +642,7 @@ def get_query_tensor(self, hidden_states): return query def get_query_key_value_tensors(self, hidden_states, key_value_states): + """Get query key value tensors.""" query = self.get_query_tensor(hidden_states) key, value = self.get_key_value_tensors(key_value_states) return query, key, value @@ -619,8 +655,17 @@ def forward( full_text_row_masked_out_mask=None, inference_params=None, rotary_pos_emb=None, + rotary_pos_cos=None, + rotary_pos_sin=None, + cross_attention_bias=None, packed_seq_params=None, ): + """Forward.""" + # hidden_states: [sq, b, h] + if self.config.flash_decode: + rotary_pos_emb = None + else: + assert rotary_pos_cos is None and rotary_pos_sin is None # For self attention we just duplicate the rotary_pos_emb if it isn't already if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): @@ -637,8 +682,8 @@ def forward( # =================================================== # Adjust key, value, and rotary_pos_emb for inference # =================================================== - key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( - inference_params, key, value, rotary_pos_emb + query, key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, query, key, value, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin ) if packed_seq_params is not None: @@ -650,9 +695,6 @@ def forward( # core attention computation # ================================== - # In TE "True" means masked out - cross_attention_masks = torch.where(cross_attention_masks == 0, False, True) - if self.checkpoint_core_attention and self.training: core_attn_out = self._checkpointed_attention_forward( query, @@ -660,6 +702,7 @@ def forward( value, cross_attention_masks, attn_mask_type=attn_mask_type, + attention_bias=cross_attention_bias, packed_seq_params=packed_seq_params, ) else: @@ -669,6 +712,7 @@ def forward( value, cross_attention_masks, attn_mask_type=attn_mask_type, + attention_bias=cross_attention_bias, packed_seq_params=packed_seq_params, ) @@ -702,8 +746,22 @@ def apply_rope_scaling( high_freq_factor: int = 4, old_context_len: int = 8192, ): + """ + Apply scaling to rotary embeddings for positional encoding. + + Args: + inv_freq (Tensor): Tensor of inverse frequencies. + factor (int): Scaling factor for medium-to-high frequencies. + low_freq_factor (int): Factor for identifying low frequencies. + high_freq_factor (int): Factor for identifying high frequencies. + old_context_len (int): Original context length for scaling computation. + + Returns: + Tensor: Scaled inverse frequencies. + """ logging.info( - f"Apply rope scaling with factor={factor}, low_freq_factor={low_freq_factor}, high_freq_factor={high_freq_factor}, old_context_len={old_context_len}." + f"Apply rope scaling with factor={factor}, low_freq_factor={low_freq_factor}, " + f"high_freq_factor={high_freq_factor}, old_context_len={old_context_len}." 
) low_freq_wavelen = old_context_len / low_freq_factor diff --git a/nemo/collections/vlm/mllama/model/vision.py b/nemo/collections/vlm/mllama/model/vision.py index f023cc7bf943..bb58ad093cd6 100644 --- a/nemo/collections/vlm/mllama/model/vision.py +++ b/nemo/collections/vlm/mllama/model/vision.py @@ -120,15 +120,16 @@ def build_encoder_attention_mask( torch.Tensor: Tensor containing the attention mask. """ masks = [] + dtype = x.dtype for ar_id in ar_ids: arx = supported_aspect_ratios[ar_id - 1] mask_i = torch.ones((num_chunks, x.shape[1] // num_chunks), device=x.device) mask_i[: arx[0] * arx[1], :ntok] = 0 mask_i = mask_i.view(num_chunks * x.shape[1] // num_chunks, -1) - mask_i = (mask_i @ mask_i.T).type(torch.bool) + mask_i = mask_i @ mask_i.T mask_i = mask_i.unsqueeze(0) masks.append(mask_i) - masks = torch.stack(masks) + masks = torch.stack(masks).to(dtype) * torch.finfo(dtype).min return masks @@ -197,6 +198,7 @@ def forward_with_return_intermediate( context: Tensor = None, context_mask: Tensor = None, rotary_pos_emb: Tensor = None, + attention_bias: Tensor = None, inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, return_intermediate: List[int] = None, @@ -253,6 +255,7 @@ def forward_with_return_intermediate( context=context, context_mask=context_mask, rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, packed_seq_params=packed_seq_params, ) else: @@ -269,6 +272,7 @@ def forward_with_return_intermediate( context=context, context_mask=context_mask, rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, inference_params=inference_params, packed_seq_params=packed_seq_params, ) @@ -506,6 +510,7 @@ def forward( attention_mask=attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, packed_seq_params=packed_seq_params, ) @@ -690,11 +695,12 @@ def forward(self, images: torch.Tensor, ar_ids: torch.Tensor) -> torch.Tensor: x = x.view(bsz * num_concurrent_media, -1, dim) npad, attn_mask = 0, None - attn_mask = build_encoder_attention_mask(x, ar_ids, ntok, num_chunks, self.config.supported_aspect_ratios) + attn_bias = build_encoder_attention_mask(x, ar_ids, ntok, num_chunks, self.config.supported_aspect_ratios) x = x.transpose(0, 1).contiguous() x, int_x = self.transformer( hidden_states=x, attention_mask=attn_mask, + attention_bias=attn_bias, return_intermediate=self.return_intermediate, ) @@ -709,6 +715,7 @@ def forward(self, images: torch.Tensor, ar_ids: torch.Tensor) -> torch.Tensor: x = self.global_transformer( hidden_states=x, attention_mask=None, + attention_bias=attn_bias, ) x = x.transpose(0, 1) x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, dim) diff --git a/nemo/collections/vlm/recipes/mllama_11b.py b/nemo/collections/vlm/recipes/mllama_11b.py index e4842ae63d52..4b08606900e3 100644 --- a/nemo/collections/vlm/recipes/mllama_11b.py +++ b/nemo/collections/vlm/recipes/mllama_11b.py @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.collections.vlm.mllama.data.mock import MockDataModule +from nemo.utils.exp_manager import TimingCallback NAME = "mllama_11b" @@ -46,7 +47,7 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - return run.Config(vlm.MLlamaModel, config=run.Config(vlm.MLlamaConfig11B)) + return run.Config(vlm.MLlamaModel, 
config=run.Config(vlm.MLlamaConfig11BInstruct)) @run.cli.factory(target=llm.finetune, name=NAME) @@ -107,6 +108,7 @@ def finetune_recipe( plugins=bf16_mixed(), strategy=strategy, val_check_interval=100, + callbacks=[run.Config(TimingCallback)], ) recipe = run.Partial( @@ -115,34 +117,37 @@ def finetune_recipe( trainer=trainer, data=run.Config( MockDataModule, - seq_length=4100, # encoder (vision) seq length - decoder_seq_length=512, # decoder (llm) seq length - global_batch_size=16, - micro_batch_size=2, + seq_length=6404, # encoder (vision) seq length + decoder_seq_length=2048, # decoder (llm) seq length + global_batch_size=2, + micro_batch_size=1, vocab_size=128256, - crop_size=(448, 448), + crop_size=(560, 560), num_workers=0, ), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=2.0e-07, warmup_steps=150), - resume=nemo_resume("meta-llama/Llama-3.2-11B-Vision"), + resume=nemo_resume("meta-llama/Llama-3.2-11B-Vision-Instruct"), ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 2e-05 elif peft_scheme.lower() == 'lora': + # pylint: disable=line-too-long + """Adapted from https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/configs/peft.py""" recipe.peft = run.Config( vlm.LoRA, - freeze_vision_model=False, + freeze_vision_model=True, target_modules=[ - "*.language_model.*.linear_qkv", - "*.language_model.*.linear_q", - "*.language_model.*.linear_kv", - "*.language_model.*.linear_proj", - "*.language_model.*.linear_fc1", - "*.language_model.*.linear_fc2", + "linear_qkv", + "linear_q", + "linear_kv", ], + dim=8, + alpha=32, + dropout=0.05, + dropout_position="pre", ) recipe.optim.config.lr = 1e-4 else: diff --git a/nemo/collections/vlm/recipes/mllama_90b.py b/nemo/collections/vlm/recipes/mllama_90b.py index 28a6ff7ff9a6..12e0329fc6dd 100644 --- a/nemo/collections/vlm/recipes/mllama_90b.py +++ b/nemo/collections/vlm/recipes/mllama_90b.py @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.collections.vlm.mllama.data.mock import MockDataModule +from nemo.utils.exp_manager import TimingCallback NAME = "mllama_90b" @@ -46,7 +47,7 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - return run.Config(vlm.MLlamaModel, config=run.Config(vlm.MLlamaConfig90B)) + return run.Config(vlm.MLlamaModel, config=run.Config(vlm.MLlamaConfig90BInstruct)) @run.cli.factory(target=llm.finetune, name=NAME) @@ -107,6 +108,7 @@ def finetune_recipe( plugins=bf16_mixed(), strategy=strategy, val_check_interval=100, + callbacks=[run.Config(TimingCallback)], ) recipe = run.Partial( @@ -116,7 +118,7 @@ def finetune_recipe( data=run.Config( MockDataModule, seq_length=6404, # encoder (vision) seq length - decoder_seq_length=512, # decoder (llm) seq length + decoder_seq_length=2048, # decoder (llm) seq length global_batch_size=16, micro_batch_size=2, vocab_size=128256, @@ -125,23 +127,26 @@ def finetune_recipe( ), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=2.0e-07, warmup_steps=150), - resume=nemo_resume("meta-llama/Llama-3.2-90B-Vision"), + 
resume=nemo_resume("meta-llama/Llama-3.2-90B-Vision-Instruct"), ) if peft_scheme is None or peft_scheme.lower() == 'none': raise ValueError("Full finetuning recipe for Llama-3.2-90B model will be supported soon.") elif peft_scheme.lower() == 'lora': + # pylint: disable=line-too-long + """Adapted from https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/configs/peft.py""" recipe.peft = run.Config( vlm.LoRA, - freeze_vision_model=False, + freeze_vision_model=True, target_modules=[ - "*.language_model.*.linear_qkv", - "*.language_model.*.linear_q", - "*.language_model.*.linear_kv", - "*.language_model.*.linear_proj", - "*.language_model.*.linear_fc1", - "*.language_model.*.linear_fc2", + "linear_qkv", + "linear_q", + "linear_kv", ], + dim=8, + alpha=32, + dropout=0.05, + dropout_position="pre", ) recipe.optim.config.lr = 1e-4 else: diff --git a/scripts/vlm/mllama_finetune.py b/scripts/vlm/mllama_finetune.py new file mode 100644 index 000000000000..2b6990a03aa5 --- /dev/null +++ b/scripts/vlm/mllama_finetune.py @@ -0,0 +1,212 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import WandbLogger +from transformers import AutoProcessor + +from nemo import lightning as nl +from nemo.collections import llm, vlm +from nemo.collections.vlm import ImageDataConfig +from nemo.collections.vlm.mllama.data.lazy import MLlamaLazyDataModule +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback + + +def main(args): + """ + Main function for setting up and training the MLLama model. + + This function prepares the data module, model, training strategy, + logger, checkpointing, and optimizer configuration. It then starts + the training loop using PyTorch Lightning's trainer. + + Args: + args (argparse.Namespace): The command-line arguments passed to the script. 
+ """ + # Setting gbs, mbs, and max_steps from arguments + gbs = args.gbs + mbs = args.mbs + max_steps = args.max_steps + + # encoder (vision) seq length + # ((img_res / patch_size) ** 2 + cls_token) * num_tiles, = ((560 / 14) ** 2 + 1) * 4 = 6404 + seq_length = 6404 + decoder_seq_length = 1024 # decoder (llm) seq length + + if args.restore_path is not None and args.restore_path.startswith("nemo://"): + model_id = args.restore_path[len("nemo://") :] + else: + model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" + + processor = AutoProcessor.from_pretrained(model_id) + image_processor = processor.image_processor + tokenizer = processor.tokenizer + + # Data configuration + data_config = ImageDataConfig( + image_folder=args.image_folder, + conv_template="mllama", + ) + + # Data module setup + data = MLlamaLazyDataModule( + paths=args.data_path, + data_config=data_config, + seq_length=seq_length, + decoder_seq_length=decoder_seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer, + image_processor=image_processor, + num_workers=16, + ) + + model_configs = { + "meta-llama/Llama-3.2-11B-Vision": vlm.MLlamaConfig11B, + "meta-llama/Llama-3.2-11B-Vision-Instruct": vlm.MLlamaConfig11BInstruct, + "meta-llama/Llama-3.2-90B-Vision": vlm.MLlamaConfig90B, + "meta-llama/Llama-3.2-90B-Vision-Instruct": vlm.MLlamaConfig90BInstruct, + } + conf = model_configs[model_id]() + if args.pp_size > 1: + conf.language_model_config.first_pipeline_num_layers = 0 + model = vlm.MLlamaModel(conf, tokenizer=tokenizer) + + # Training strategy setup + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=args.tp_size, + pipeline_model_parallel_size=args.pp_size, + encoder_pipeline_model_parallel_size=args.encoder_pp_size, + pipeline_dtype=torch.bfloat16, + ) + + # Checkpoint callback setup + checkpoint_callback = nl.ModelCheckpoint( + save_last=True, + monitor="reduced_train_loss", + save_top_k=6, + every_n_train_steps=100, + dirpath=args.log_dir, + ) + + # Trainer setup + trainer = nl.Trainer( + num_nodes=args.num_nodes, + devices=args.devices, + max_steps=max_steps, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + callbacks=[checkpoint_callback, TimingCallback()], + val_check_interval=500, + limit_val_batches=gbs, + log_every_n_steps=1, + num_sanity_val_steps=0, + ) + + # Logger setup + nemo_logger = nl.NeMoLogger( + log_dir=args.log_dir, + name=args.name, + wandb=WandbLogger(project=args.wandb_project, name=args.name) if args.wandb_project is not None else None, + ) + + # Auto resume setup + resume = nl.AutoResume( + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + resume_from_directory=args.log_dir, + restore_config=nl.RestoreConfig(path=args.restore_path) if args.restore_path is not None else None, + ) + + # Optimizer and scheduler setup + opt_config = OptimizerConfig( + optimizer='adam', + lr=args.lr, + adam_beta1=0.9, + adam_beta2=0.95, + use_distributed_optimizer=True, + bf16=True, + ) + sched = CosineAnnealingScheduler( + max_steps=trainer.max_steps, + warmup_steps=100, + constant_steps=0, + min_lr=args.lr, + ) + opt = MegatronOptimizerModule(opt_config, sched) + + # PEFT setup + if args.peft == 'lora': + peft = vlm.peft.LoRA( + freeze_vision_model=True, + target_modules=[ + "linear_qkv", + "linear_q", + "linear_kv", + ], + dim=8, + alpha=32, + dropout=0.05, + dropout_position="pre", + ) + else: + peft = None + + llm.finetune( + model=model, + data=data, + trainer=trainer, + peft=peft, + log=nemo_logger, + 
optim=opt, + resume=resume, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Mllama Model Training Script") + + parser.add_argument( + "--restore_path", type=str, required=False, default=None, help="Path to restore model from checkpoint" + ) + parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset") + parser.add_argument("--image_folder", type=str, required=True, help="Path to the image folder") + parser.add_argument( + "--log_dir", + type=str, + required=False, + default="/results", + help="Directory for logging and checkpoints", + ) + parser.add_argument("--devices", type=int, required=False, default=1) + parser.add_argument("--num_nodes", type=int, required=False, default=1) + parser.add_argument("--max_steps", type=int, required=False, default=5190) + parser.add_argument("--tp_size", type=int, required=False, default=1) + parser.add_argument("--pp_size", type=int, required=False, default=1) + parser.add_argument("--encoder_pp_size", type=int, required=False, default=0) + parser.add_argument("--name", type=str, required=False, default="neva_pretrain") + parser.add_argument("--peft", type=str, default='none', help="none | lora") + parser.add_argument("--wandb_project", type=str, required=False, default=None) + parser.add_argument("--gbs", type=int, required=False, default=64, help="Global batch size") + parser.add_argument("--mbs", type=int, required=False, default=2, help="Micro batch size") + parser.add_argument("--lr", type=float, required=False, default=2.0e-06, help="Learning rate") + + args = parser.parse_args() + main(args) diff --git a/scripts/vlm/mllama_generation.py b/scripts/vlm/mllama_generation.py new file mode 100644 index 000000000000..4ebf2d0055ad --- /dev/null +++ b/scripts/vlm/mllama_generation.py @@ -0,0 +1,164 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
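The generation script whose header appears above implements inference as a plain greedy loop: run the model, take the argmax over the final position's logits, append the token, and stop at EOS or after a fixed budget of new tokens (the real script additionally broadcasts each chosen token from rank 0). A minimal standalone version of that idea, with a random stand-in for the model (illustration only):

import torch

def greedy_decode(step_fn, tokens, eos_id, max_new_tokens=100):
    # step_fn maps [batch, seq] token ids to [batch, seq, vocab] logits
    for _ in range(max_new_tokens):
        logits = step_fn(tokens)
        next_ids = torch.argmax(logits[:, -1], dim=-1, keepdim=True)
        tokens = torch.cat([tokens, next_ids], dim=-1)
        if (next_ids == eos_id).all():
            break
    return tokens

vocab_size = 16
fake_model = lambda t: torch.randn(t.size(0), t.size(1), vocab_size)  # stand-in, not the VLM
out = greedy_decode(fake_model, torch.zeros(1, 4, dtype=torch.long), eos_id=vocab_size - 1)
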
+ +import argparse + +import requests +import torch +from PIL import Image +from transformers import AutoProcessor + +from nemo import lightning as nl +from nemo.collections import vlm +from nemo.collections.vlm.mllama.model.utils import create_vision_mask_tensor + +model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" + + +def load_image(image_url: str) -> Image.Image: + # pylint: disable=C0115,C0116 + try: + response = requests.get(image_url, stream=True) + response.raise_for_status() + image = Image.open(response.raw) + return image + except requests.exceptions.RequestException as e: + print(f"Error loading image from {image_url}: {e}") + return None + + +def generate(model, processor, image, text): + # pylint: disable=C0115,C0116 + tokenizer = processor.tokenizer + + messages = [ + { + "role": "user", + "content": [{"type": "text", "text": text}], + } + ] + input_text = processor.apply_chat_template(messages, add_generation_prompt=True) + batch = processor(image, input_text, add_special_tokens=False, return_tensors="pt") + + input_ids = batch["input_ids"].cuda(non_blocking=True) + position_ids = ( + torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0).expand_as(input_ids) + ) + num_tiles = processor.image_processor.preprocess(image, return_tensors='pt')["num_tiles"] + + min_prompt_len = position_ids.shape[-1] + + input_ids = input_ids[:, :min_prompt_len] + generated_ids = input_ids.clone() + + from tqdm import tqdm + + for cur_pos in tqdm(range(min_prompt_len, min_prompt_len + 100)): + with torch.no_grad(): + position_ids = torch.arange(0, cur_pos, dtype=torch.long, device="cuda").reshape(1, -1) + batch_masks = create_vision_mask_tensor(generated_ids[0]) + + output = model( + batch_images=batch["pixel_values"].cuda(non_blocking=True), + batch_masks=[batch_masks], + num_chunks=torch.tensor(num_tiles), + aspect_ratio_ids=batch["aspect_ratio_ids"].cuda(non_blocking=True), + tokens=generated_ids, + position_ids=position_ids, + ) + + next_token_ids = torch.argmax(output[:, -1], dim=-1, keepdim=True) + # Broadcast the tensor from rank 0 to all other ranks + torch.distributed.broadcast(next_token_ids, src=0) + generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) + if (next_token_ids == tokenizer.eos_token_id).all(): + break + + generated_ids = generated_ids.tolist() + generated_texts = tokenizer.decode(generated_ids[0][min_prompt_len:]) + + if torch.distributed.get_rank() == 0: + print("======== GENERATED TEXT OUTPUT ========") + print(f"{generated_texts}") + print("=======================================") + return generated_texts + + +def main(args) -> None: + # pylint: disable=C0115,C0116 + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=args.tp_size, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ) + trainer = nl.Trainer( + devices=args.tp_size, + max_steps=1000, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + val_check_interval=1000, + limit_val_batches=50, + ) + + processor = AutoProcessor.from_pretrained(model_id) + tokenizer = processor.tokenizer + + fabric = trainer.to_fabric() + + if args.load_from_hf: + model = fabric.import_model(f"hf://{model_id}", vlm.MLlamaModel) + else: + model = vlm.MLlamaModel(vlm.MLlamaConfig11BInstruct(), tokenizer=tokenizer) + model = fabric.load_model(args.local_model_path, model) + + model = model.module.cuda() + model.eval() + model = model.to(torch.bfloat16) + + # Load the image + raw_image = load_image(args.image_url) + 
if raw_image is None: + return # Exit if the image can't be loaded + + generate(model, processor, image=raw_image, text="<|image|>\nDescribe the image.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--load_from_hf", + action="store_true", + help="Flag to indicate whether to load the model from Hugging Face hub.", + ) + parser.add_argument( + "--local_model_path", + type=str, + default=None, + help="Local path to the model if not loading from Hugging Face.", + ) + parser.add_argument( + "--image_url", + type=str, + # pylint: disable=line-too-long + default="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + help="URL of the image to use for inference.", + ) + parser.add_argument("--devices", type=int, required=False, default=1) + parser.add_argument("--tp_size", type=int, required=False, default=1) + parser.add_argument("--pp_size", type=int, required=False, default=1) + parser.add_argument("--encoder_pp_size", type=int, required=False, default=0) + + args = parser.parse_args() + main(args) From ee072617f15f2d47cb1888853af60d81c56e7fba Mon Sep 17 00:00:00 2001 From: nune-tadevosyan <152167970+nune-tadevosyan@users.noreply.github.com> Date: Mon, 25 Nov 2024 12:18:34 +0400 Subject: [PATCH 09/11] Lhotse support for transcribe_speech_parallel (#11249) * Lhotse support for transcribe_speech_parallel Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Removing prints Signed-off-by: Nune * Remove Signed-off-by: Nune * Adding shard_id Signed-off-by: Nune * Handling empty text fields Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Changing keys Signed-off-by: Nune * Key Signed-off-by: Nune * Commented issues Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Commented issues Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * test for lhotse metadata return Signed-off-by: Nune * test for lhotse metadata return Signed-off-by: Nune * Small change Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Support for RNNT and CTC model Signed-off-by: Nune * Support for all models Signed-off-by: Nune * Small change Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Tests for predict_step Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Adding support for force_map_dataset Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan --------- Signed-off-by: Nune Signed-off-by: nune-tadevosyan Co-authored-by: nune-tadevosyan --- examples/asr/transcribe_speech_parallel.py | 10 +++- .../asr/data/audio_to_text_dataset.py | 8 +++- .../asr/data/audio_to_text_lhotse.py | 7 ++- .../asr/models/configs/asr_models_config.py | 11 +++++ nemo/collections/asr/models/ctc_bpe_models.py | 12 +++-- nemo/collections/asr/models/ctc_models.py | 11 +++-- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 8 +++- .../asr/models/hybrid_rnnt_ctc_models.py | 4 +- .../collections/asr/models/rnnt_bpe_models.py | 8 +++- nemo/collections/asr/models/rnnt_models.py | 11 +++-- .../asr/models/transformer_bpe_models.py | 8 +++- .../common/data/lhotse/dataloader.py | 43 ++++++++++++----- .../asr/test_asr_ctc_encoder_model_bpe.py | 33 +++++++++++++ .../asr/test_asr_ctcencdec_model.py | 35 ++++++++++++++ 
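Patch 09 enables Lhotse dataloading in transcribe_speech_parallel.py by flipping a few predict_ds flags before the dataloader is built. The snippet below restates those overrides outside the script purely to make their intent explicit (it mirrors the added code rather than replacing it; the manifest value is a placeholder):

from omegaconf import OmegaConf

predict_ds = OmegaConf.create({"use_lhotse": True, "manifest_filepath": "dummy_manifest.json"})
OmegaConf.set_struct(predict_ds, False)
predict_ds.force_finite = True       # make the Lhotse sampler finite so prediction terminates
predict_ds.force_map_dataset = True  # keep the sampler in the loop process (map-dataset flavor)
predict_ds.do_transcribe = True      # dataset returns cuts so per-sample metadata can be written
OmegaConf.set_struct(predict_ds, True)
# The script also sets trainer.use_distributed_sampler = False and, once the trainer exists,
# copies trainer.global_rank / trainer.world_size into predict_ds for the CPU-initialized model.
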
.../asr/test_asr_hybrid_rnnt_ctc_model_bpe.py | 15 ++++++ .../test_asr_hybrid_rnnt_ctc_model_char.py | 17 +++++++ .../asr/test_asr_lhotse_dataset.py | 32 +++++++++++++ .../asr/test_asr_rnnt_encdec_model.py | 17 +++++++ .../asr/test_asr_rnnt_encoder_model_bpe.py | 48 +++++++++++++++---- 19 files changed, 296 insertions(+), 42 deletions(-) diff --git a/examples/asr/transcribe_speech_parallel.py b/examples/asr/transcribe_speech_parallel.py index bdf54ea67f7d..d60099acd379 100644 --- a/examples/asr/transcribe_speech_parallel.py +++ b/examples/asr/transcribe_speech_parallel.py @@ -163,6 +163,14 @@ def main(cfg: ParallelTranscriptionConfig): cfg.predict_ds.return_sample_id = True cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model.cfg.train_ds) + if cfg.predict_ds.use_lhotse: + OmegaConf.set_struct(cfg.predict_ds, False) + cfg.trainer.use_distributed_sampler = False + cfg.predict_ds.force_finite = True + cfg.predict_ds.force_map_dataset = True + cfg.predict_ds.do_transcribe = True + OmegaConf.set_struct(cfg.predict_ds, True) + if isinstance(model, EncDecMultiTaskModel): cfg.trainer.use_distributed_sampler = False OmegaConf.set_struct(cfg.predict_ds, False) @@ -172,7 +180,7 @@ def main(cfg: ParallelTranscriptionConfig): trainer = ptl.Trainer(**cfg.trainer) - if isinstance(model, EncDecMultiTaskModel): + if cfg.predict_ds.use_lhotse: OmegaConf.set_struct(cfg.predict_ds, False) cfg.predict_ds.global_rank = trainer.global_rank cfg.predict_ds.world_size = trainer.world_size diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 76537a8b2b78..f91710de3cb3 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -867,10 +867,16 @@ def write_on_batch_end( sample = sample_id if isinstance(sample, lhotse.cut.MixedCut): sample = sample.first_non_padding_cut + if sample.recording.sources[0].source != '': + item["audio_filepath"] = sample.recording.sources[0].source + else: + item["audio_filepath"] = sample.id item["audio_filepath"] = sample.recording.sources[0].source item["offset"] = sample.start item["duration"] = sample.duration - item["text"] = sample.supervisions[0].text + item["text"] = sample.supervisions[0].text or '' + if hasattr(sample, 'shard_id'): + item["shard_id"] = sample.shard_id item["pred_text"] = transcribed_text self.outf.write(json.dumps(item) + "\n") self.samples_num += 1 diff --git a/nemo/collections/asr/data/audio_to_text_lhotse.py b/nemo/collections/asr/data/audio_to_text_lhotse.py index f916ae1de56b..0ae3059a9296 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse.py @@ -43,17 +43,18 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), } - def __init__(self, tokenizer): + def __init__(self, tokenizer, return_cuts=False): super().__init__() self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) + self.return_cuts = return_cuts def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: audio, audio_lens, cuts = self.load_audio(cuts) tokens = [ torch.cat( [ - torch.as_tensor(s.tokens if hasattr(s, "tokens") else self.tokenizer(s.text, s.language)) + torch.as_tensor(s.tokens if hasattr(s, "tokens") else self.tokenizer(s.text or "", s.language)) for s in c.supervisions ], dim=0, @@ -62,6 +63,8 @@ def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: 
] token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) tokens = collate_vectors(tokens, padding_value=0) + if self.return_cuts: + return audio, audio_lens, tokens, token_lens, cuts.drop_in_memory_data() return audio, audio_lens, tokens, token_lens diff --git a/nemo/collections/asr/models/configs/asr_models_config.py b/nemo/collections/asr/models/configs/asr_models_config.py index 29dbbe06d1f8..081233da5d32 100644 --- a/nemo/collections/asr/models/configs/asr_models_config.py +++ b/nemo/collections/asr/models/configs/asr_models_config.py @@ -41,6 +41,17 @@ class ASRDatasetConfig(nemo.core.classes.dataset.DatasetConfig): shard_manifests: bool = False shuffle_n: int = 0 + # lhotse support + use_lhotse: bool = False + tarred_random_access: bool = False + use_bucketing: bool = False + batch_duration: Optional[int] = None + quadratic_duration: Optional[int] = None + bucket_batch_size: Optional[int] = None + bucket_duration_bins: Optional[list] = None + num_buckets: Optional[int] = 0 + pin_memory: bool = False + # Optional int_values: Optional[int] = None augmentor: Optional[Dict[str, Any]] = None diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 79c22794de01..1f84989c8ebe 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -97,9 +97,15 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, - dataset=LhotseSpeechToTextBpeDataset(tokenizer=self.tokenizer), + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), + dataset=LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), + ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 76dcd13cca50..ae8c35220931 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -309,8 +309,11 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. 
+ global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=make_parser( labels=config.get('labels', None), @@ -319,6 +322,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): blank_id=config.get('blank_index', -1), do_normalize=config.get('normalize_transcripts', False), ), + return_cuts=config.get("do_transcribe", False), ), ) @@ -614,7 +618,8 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): return_hypotheses=False, ) - sample_id = sample_id.cpu().detach().numpy() + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.cpu().detach().numpy() return list(zip(sample_id, transcribed_texts)) def validation_pass(self, batch, batch_idx, dataloader_idx=0): diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 7e8720ee3ad8..cd04a5ad2462 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -140,10 +140,14 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index 34dd9aae5711..1f63c617cea2 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -519,8 +519,8 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) - - sample_id = sample_id.cpu().detach().numpy() + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.cpu().detach().numpy() return list(zip(sample_id, best_hyp_text)) def validation_pass(self, batch, batch_idx, dataloader_idx): diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index c92bcfaaef7a..cd8667f2f0fe 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -509,10 +509,14 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. 
+ global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index e4d1abd0b50c..78038d404107 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -469,8 +469,11 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=make_parser( labels=config.get('labels', None), @@ -479,6 +482,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): blank_id=config.get('blank_index', -1), do_normalize=config.get('normalize_transcripts', False), ), + return_cuts=config.get("do_transcribe", False), ), ) @@ -814,7 +818,8 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) - sample_id = sample_id.cpu().detach().numpy() + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.cpu().detach().numpy() return list(zip(sample_id, best_hyp_text)) def validation_pass(self, batch, batch_idx, dataloader_idx=0): diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 8d0f2b2223a3..4692cb662b4b 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -225,10 +225,14 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): config = self._update_default_values(config) return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 98b63a07fa9d..bf6b77ad907e 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -147,6 +147,28 @@ class LhotseDataLoadingConfig: # In most cases (such as regular multi-GPU training) it will result in a deadlock due to # a different number of steps on different DDP ranks. 
force_finite: bool = False + # The following two options may be used to override auto-detection of appropriate PyTorch dataset flavor + # for your data types. PyTorch DataLoader uses two objects to yield data: dataset and sampler. + # *Map-dataset flavor.* There is one sampler per GPU that lives in the training loop process; + # it selects the examples to be prepared by map-dataset class. Each batch selection determined by the sampler + # is then passed by the dataloader to one of its worker processes to be processed by the dataset class. + # *Iterable-dataset flavor.* Each dataloading worker has its own sampler replica instead; + # the sampler must have the logic for either data deduplication or unique order shuffling to avoid + # duplicated data across workers and GPUs. Lhotse relies on unique order shuffling. + # The default settings are: + # * use iterable dataset for tarred audio data. + # * use iterable dataset for any text data. + # * use map dataset for non-tarred audio data (we might change this in the future) + force_map_dataset: bool = False + force_iterable_dataset: bool = False + + +def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfig) -> bool: + assert not ( + config.force_map_dataset and config.force_iterable_dataset + ), "Conflicting options: force_map_dataset=True and force_iterable_dataset=True" + use_iterable_dataset = (use_iterable_dataset or config.force_iterable_dataset) and not config.force_map_dataset + return use_iterable_dataset def get_lhotse_dataloader_from_config( @@ -176,7 +198,6 @@ def get_lhotse_dataloader_from_config( Note that ``tokenizer`` can be any tokenizer type (e.g. both SentencePiece and Aggregate tokenizers work). """ logging.info("We will be using a Lhotse DataLoader.") - config = make_structured_with_schema_warnings(config) maybe_set_cuda_expandable_segments(enabled=config.cuda_expandable_segments) @@ -186,8 +207,8 @@ def get_lhotse_dataloader_from_config( fix_random_seed(seed) # 1. Load a manifest as a Lhotse CutSet. - cuts, is_tarred = read_cutset_from_config(config) - + cuts, use_iterable_dataset = read_cutset_from_config(config) + use_iterable_dataset = determine_use_iterable_dataset(use_iterable_dataset, config) # Apply channel selector if config.channel_selector is not None: logging.info('Using channel selector %s.', config.channel_selector) @@ -202,7 +223,7 @@ def get_lhotse_dataloader_from_config( if tokenizer is not None and config.pretokenize: from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper - if not is_tarred: + if not use_iterable_dataset: logging.warning( "You are using a non-tarred dataset and requested tokenization during data sampling (pretokenize=True). 
" "This will cause the tokenization to happen in the main (GPU) process, possibly impacting the training speed " @@ -317,8 +338,8 @@ def get_lhotse_dataloader_from_config( duration_bins=determine_bucket_duration_bins(config), num_cuts_for_bins_estimate=config.num_cuts_for_bins_estimate, buffer_size=config.bucket_buffer_size, - rank=0 if is_tarred else global_rank, - world_size=1 if is_tarred else world_size, + rank=0 if use_iterable_dataset else global_rank, + world_size=1 if use_iterable_dataset else world_size, ) else: # Non-bucketing sampler, similar to original NeMo dataloading without bucketing, @@ -335,8 +356,8 @@ def get_lhotse_dataloader_from_config( drop_last=config.drop_last, shuffle_buffer_size=config.shuffle_buffer_size, seed=config.shard_seed, - rank=0 if is_tarred else global_rank, - world_size=1 if is_tarred else world_size, + rank=0 if use_iterable_dataset else global_rank, + world_size=1 if use_iterable_dataset else world_size, ) if config.concatenate_samples: @@ -368,7 +389,7 @@ def get_lhotse_dataloader_from_config( ) # 4. Creating dataloader. - if is_tarred and not config.tarred_random_access: + if use_iterable_dataset and not config.tarred_random_access: # Wrapper here is necessary when using NeMo tarred data or Lhotse Shar data, # because then I/O happens upon sampler iteration. Normally, the sampler resides # in the training loop process, but when we use iterable dataset, we can move it to @@ -601,8 +622,8 @@ class DurationFilter: """Callable, returns ``True`` if a cut's duration is in range [d_min, d_max] and ``False`` otherwise.""" def __init__(self, d_min: float, d_max: float) -> None: - self.d_min = d_min - self.d_max = d_max + self.d_min = d_min if d_min is not None else -1.0 + self.d_max = d_max if d_max is not None else float("inf") def __call__(self, example) -> bool: if isinstance(example, Cut): diff --git a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py index 247906247091..02442291a918 100644 --- a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py @@ -19,9 +19,12 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig from nemo.collections.asr.data import audio_to_text +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import configs from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.submodules import ctc_beam_decoding as beam_decode @@ -118,6 +121,18 @@ def test_forward(self, asr_model): diff = torch.max(torch.abs(logprobs_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, asr_model): + asr_model = asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=asr_model.tokenizer, return_cuts=True) + batch = dataset[cuts] + outputs = asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.with_downloads() @pytest.mark.unit def test_save_restore_artifact(self, asr_model): @@ -333,6 +348,15 @@ def test_ASRDatasetConfig_for_AudioToBPEDataset(self): 'bucketing_strategy', 'bucketing_weights', 'channel_selector', + 'use_lhotse', + 'tarred_random_access', + 
'use_bucketing', + 'batch_duration', + 'quadratic_duration', + 'bucket_batch_size', + 'bucket_duration_bins', + 'num_buckets', + 'pin_memory', ] REMAP_ARGS = {'trim_silence': 'trim', 'labels': 'tokenizer'} @@ -372,6 +396,15 @@ def test_ASRDatasetConfig_for_TarredAudioToBPEDataset(self): 'bucketing_strategy', 'bucketing_weights', 'max_utts', + 'use_lhotse', + 'tarred_random_access', + 'use_bucketing', + 'batch_duration', + 'quadratic_duration', + 'bucket_batch_size', + 'bucket_duration_bins', + 'num_buckets', + 'pin_memory', ] REMAP_ARGS = { diff --git a/tests/collections/asr/test_asr_ctcencdec_model.py b/tests/collections/asr/test_asr_ctcencdec_model.py index 28a07fd54663..55451758578f 100644 --- a/tests/collections/asr/test_asr_ctcencdec_model.py +++ b/tests/collections/asr/test_asr_ctcencdec_model.py @@ -15,12 +15,16 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig, OmegaConf, open_dict import nemo.collections.asr as nemo_asr from nemo.collections.asr.data import audio_to_text +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import EncDecCTCModel, configs from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.common.parts.preprocessing.parsers import make_parser from nemo.utils.config_utils import assert_dataclass_signature_match, update_model_config @@ -131,6 +135,19 @@ def test_forward(self, asr_model): diff = torch.max(torch.abs(logprobs_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, asr_model): + token_list = [" ", "a", "b", "c"] + asr_model = asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=make_parser(labels=token_list), return_cuts=True) + batch = dataset[cuts] + outputs = asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.unit def test_vocab_change(self, asr_model): old_vocab = copy.deepcopy(asr_model.decoder.vocabulary) @@ -274,6 +291,15 @@ def test_ASRDatasetConfig_for_AudioToCharDataset(self): 'bucketing_strategy', 'bucketing_weights', 'channel_selector', + 'use_lhotse', + 'tarred_random_access', + 'use_bucketing', + 'batch_duration', + 'quadratic_duration', + 'bucket_batch_size', + 'bucket_duration_bins', + 'num_buckets', + 'pin_memory', ] REMAP_ARGS = {'trim_silence': 'trim'} @@ -307,6 +333,15 @@ def test_ASRDatasetConfig_for_TarredAudioToCharDataset(self): 'bucketing_strategy', 'bucketing_weights', 'max_utts', + 'use_lhotse', + 'tarred_random_access', + 'use_bucketing', + 'batch_duration', + 'quadratic_duration', + 'bucket_batch_size', + 'bucket_duration_bins', + 'num_buckets', + 'pin_memory', ] REMAP_ARGS = { diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py index 1743acc6878c..d13c879e47f9 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py @@ -18,8 +18,11 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from 
nemo.collections.asr.models.hybrid_rnnt_ctc_bpe_models import EncDecHybridRNNTCTCBPEModel from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode @@ -166,6 +169,18 @@ def test_forward(self, hybrid_asr_model): diff = torch.max(torch.abs(logits_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, hybrid_asr_model): + hybrid_asr_model = hybrid_asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=hybrid_asr_model.tokenizer, return_cuts=True) + batch = dataset[cuts] + outputs = hybrid_asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.with_downloads() @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py index 5362966e2e9e..b5c34e197237 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py @@ -16,14 +16,18 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig, ListConfig +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import EncDecHybridRNNTCTCModel from nemo.collections.asr.modules import RNNTDecoder, RNNTJoint, SampledRNNTJoint, StatelessTransducerDecoder from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig from nemo.collections.asr.parts.utils import rnnt_utils +from nemo.collections.common.parts.preprocessing.parsers import make_parser from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ from nemo.utils.config_utils import assert_dataclass_signature_match @@ -164,6 +168,19 @@ def test_forward(self, hybrid_asr_model): diff = torch.max(torch.abs(logprobs_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, hybrid_asr_model): + token_list = [" ", "a", "b", "c"] + hybrid_asr_model = hybrid_asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=make_parser(labels=token_list), return_cuts=True) + batch = dataset[cuts] + outputs = hybrid_asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', diff --git a/tests/collections/asr/test_asr_lhotse_dataset.py b/tests/collections/asr/test_asr_lhotse_dataset.py index 5a1450e606ac..c131fac70310 100644 --- a/tests/collections/asr/test_asr_lhotse_dataset.py +++ b/tests/collections/asr/test_asr_lhotse_dataset.py @@ -65,3 +65,35 @@ def test_lhotse_asr_dataset(tokenizer): assert tokens[2].tolist() == [1, 7, 10, 19, 20, 21, 1, 20, 6, 4, 16, 15, 5] assert token_lens.tolist() == 
[11, 11, 13] + + +def test_lhotse_asr_dataset_metadata(tokenizer): + + cuts = DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True) + + cuts[0].id = "cuts0" + cuts[1].id = "cuts1" + cuts[0].supervisions = [ + SupervisionSegment(id="cuts0-sup0", recording_id=cuts[0].recording_id, start=0.2, duration=0.5, text="first"), + ] + cuts[1].supervisions = [ + SupervisionSegment(id="cuts1-sup0", recording_id=cuts[1].recording_id, start=0, duration=1, text=""), + ] + + datasets_metadata = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer, return_cuts=True) + batch = datasets_metadata[cuts] + assert isinstance(batch, tuple) + assert len(batch) == 5 + + _, _, _, _, cuts_metadata = batch + + assert cuts_metadata[0].supervisions[0].text == "first" + assert cuts_metadata[1].supervisions[0].text == "" + assert cuts_metadata[0].id == "cuts0" + assert cuts_metadata[1].id == "cuts1" + + assert cuts_metadata[0].supervisions[0].duration == 0.5 + assert cuts_metadata[0].supervisions[0].start == 0.2 + + assert cuts_metadata[1].supervisions[0].duration == 1 + assert cuts_metadata[1].supervisions[0].start == 0.0 diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index d68088fce376..5e810243c919 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -17,13 +17,17 @@ import pytest import torch import torch.nn.functional as F +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig, ListConfig +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import EncDecRNNTModel from nemo.collections.asr.modules import HATJoint, RNNTDecoder, RNNTJoint, SampledRNNTJoint, StatelessTransducerDecoder from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode from nemo.collections.asr.parts.utils import rnnt_utils +from nemo.collections.common.parts.preprocessing.parsers import make_parser from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ from nemo.utils.config_utils import assert_dataclass_signature_match @@ -296,6 +300,19 @@ def test_forward(self, asr_model): diff = torch.max(torch.abs(logprobs_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, asr_model): + token_list = [" ", "a", "b", "c"] + asr_model = asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=make_parser(labels=token_list), return_cuts=True) + batch = dataset[cuts] + outputs = asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', diff --git a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py index 960445061e24..aba364868e88 100644 --- a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py @@ -18,8 +18,11 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from 
omegaconf import DictConfig +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import ASRModel from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode @@ -64,12 +67,18 @@ def asr_model(test_data_dir): decoder = { '_target_': 'nemo.collections.asr.modules.RNNTDecoder', - 'prednet': {'pred_hidden': model_defaults['pred_hidden'], 'pred_rnn_layers': 1,}, + 'prednet': { + 'pred_hidden': model_defaults['pred_hidden'], + 'pred_rnn_layers': 1, + }, } joint = { '_target_': 'nemo.collections.asr.modules.RNNTJoint', - 'jointnet': {'joint_hidden': 32, 'activation': 'relu',}, + 'jointnet': { + 'joint_hidden': 32, + 'activation': 'relu', + }, } decoding = {'strategy': 'greedy_batch', 'greedy': {'max_symbols': 30}} @@ -123,7 +132,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): class TestEncDecRNNTBPEModel: @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.with_downloads() @pytest.mark.unit @@ -137,7 +147,8 @@ def test_constructor(self, asr_model): @pytest.mark.with_downloads() @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_forward(self, asr_model): @@ -170,9 +181,22 @@ def test_forward(self, asr_model): diff = torch.max(torch.abs(logits_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, asr_model): + asr_model = asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=asr_model.tokenizer, return_cuts=True) + batch = dataset[cuts] + outputs = asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.with_downloads() @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_save_restore_artifact(self, asr_model): @@ -190,7 +214,8 @@ def test_save_restore_artifact(self, asr_model): @pytest.mark.with_downloads() @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_save_restore_artifact_spe(self, asr_model, test_data_dir): @@ -236,7 +261,8 @@ def test_save_restore_artifact_agg(self, asr_model, test_data_dir): @pytest.mark.with_downloads() @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_vocab_change(self, test_data_dir, asr_model): @@ -266,7 +292,8 @@ def test_vocab_change(self, test_data_dir, asr_model): @pytest.mark.with_downloads() 
@pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_decoding_change(self, asr_model): @@ -309,7 +336,8 @@ def test_decoding_change(self, asr_model): @pytest.mark.with_downloads() @pytest.mark.unit @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) def test_save_restore_nested_model(self): with tempfile.TemporaryDirectory() as tmp_dir: @@ -330,7 +358,7 @@ def test_save_restore_nested_model(self): # Check size of the checkpoint, which contains weights from pretrained model + linear layer fp_weights = os.path.join(tmp_dir, 'model_weights.ckpt') - assert os.path.getsize(fp_weights) > 50 * (2 ** 20) # Assert the weights are more than 50 MB + assert os.path.getsize(fp_weights) > 50 * (2**20) # Assert the weights are more than 50 MB # Check if param after restoration is exact match original_state_dict = model.inner_model.state_dict() From 42d164e558555669fd96ba9a56e9afb6c1bc1ee1 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Mon, 25 Nov 2024 02:25:51 -0800 Subject: [PATCH 10/11] Fix environment variables in torchrun executor (#11363) Signed-off-by: Hemil Desai Co-authored-by: Marc Romeyn --- nemo/collections/llm/recipes/run/executor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nemo/collections/llm/recipes/run/executor.py b/nemo/collections/llm/recipes/run/executor.py index 305fa6b0a3c7..fe14a4f55bd2 100644 --- a/nemo/collections/llm/recipes/run/executor.py +++ b/nemo/collections/llm/recipes/run/executor.py @@ -18,11 +18,7 @@ def torchrun(devices: int = 8) -> run.Config[run.LocalExecutor]: """Local executor using torchrun.""" env_vars = { - "TRANSFORMERS_OFFLINE": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", } executor = run.Config( From 8f779babf33203f0ea42ebfcb3edc92fde5742d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Mon, 25 Nov 2024 13:24:15 +0100 Subject: [PATCH 11/11] Add sample generate to PTQ for NeMo 2.0 (#11339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Initial commit Signed-off-by: Piotr Kaminski * Remove leftover print Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Fix docs and type annotations Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Applied code review suggestions Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Fix _get_decoder_type parameter Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 --- .../collections/llm/quantization/quantizer.py | 113 ++++++++++-------- nemo/collections/llm/quantization/utils.py | 32 ++++- scripts/llm/ptq.py | 9 ++ 3 files changed, 103 insertions(+), 51 deletions(-) diff --git a/nemo/collections/llm/quantization/quantizer.py 
b/nemo/collections/llm/quantization/quantizer.py index 45f72f06741e..d41ba39f39ea 100644 --- a/nemo/collections/llm/quantization/quantizer.py +++ b/nemo/collections/llm/quantization/quantizer.py @@ -24,10 +24,12 @@ from tqdm import tqdm from nemo.collections import llm -from nemo.lightning.ckpt_utils import CONTEXT_PATH +from nemo.collections.llm.inference import MCoreTokenizerWrappper, generate +from nemo.lightning.ckpt_utils import ckpt_to_context_subdir +from nemo.lightning.megatron_parallel import MegatronParallel from nemo.utils import logging -from .utils import get_unwrapped_mcore_model +from .utils import get_modelopt_decoder_type, get_unwrapped_mcore_model try: import modelopt.torch.quantization as mtq @@ -83,35 +85,12 @@ class ExportConfig: decoder_type: Optional[str] = None inference_tensor_parallel: int = 1 inference_pipeline_parallel: int = 1 + generate_sample: bool = False def __post_init__(self): self.path = Path(self.path) -def get_modelopt_decoder_type(config: llm.GPTConfig) -> str: - """Infers the modelopt decoder type from GPTConfig class.""" - mapping = [ - (llm.Baichuan2Config, "baichuan"), - (llm.ChatGLMConfig, "chatglm"), - (llm.GemmaConfig, "gemma"), - (llm.LlamaConfig, "llama"), - (llm.MistralConfig7B, "llama"), - (llm.MixtralConfig, "llama"), - (llm.NemotronConfig, "gptnext"), - (llm.Qwen2Config, "qwen"), - # TODO: (llm.StarcoderConfig, ""), - (llm.Starcoder2Config, "gptnext"), - ] - - for config_class, decoder_type in mapping: - if isinstance(config, config_class): - return decoder_type - - logging.warning("Could not directly infer the decoder type") - # TODO: Add a reasonable behavior for GPTConfig (for instance based on position_embedding_type) - return "llama" - - class Quantizer: """Post-training quantization (PTQ) and TensorRT-LLM export of NeMo 2.0 checkpoints. 
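The class-to-decoder mapping removed from quantizer.py here reappears later in this patch in nemo/collections/llm/quantization/utils.py, now keyed on llm.GPTModel subclasses rather than on config classes. Below is a minimal, non-authoritative sketch of the intended call pattern; constructing the model directly and the specific Llama config used are illustrative assumptions, not taken from these hunks (inside the quantizer the instance is obtained by unwrapping MegatronParallel instead).

# Illustrative sketch only, not part of the patch: infer the modelopt decoder type
# from a model instance using the helper that this patch moves into utils.py.
from nemo.collections import llm
from nemo.collections.llm.quantization.utils import get_modelopt_decoder_type

# Building the model here is purely for demonstration; no weights are materialized
# at construction time in NeMo 2.0, the config only describes the architecture.
model = llm.LlamaModel(config=llm.Llama31Config8B())
print(get_modelopt_decoder_type(model))  # expected to print "llama" per the mapping shown below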
@@ -146,16 +125,37 @@ def __init__(self, quantization_config: QuantizationConfig, export_config: Expor assert dtype in SUPPORTED_DTYPE, f"Unsupported export dtype: {dtype}" self.torch_dtype = torch_dtype_from_precision(dtype) - def _setup(self, model: llm.GPTModel) -> None: + @staticmethod + def _setup(model: MegatronParallel) -> None: """Setup model for quantization.""" # TODO: disable activation checkpointing model.config.vocab_size = model.tokenizer.vocab_size model.freeze() - def _get_decoder_type(self, config: llm.GPTConfig): - return self.export_config.decoder_type or get_modelopt_decoder_type(config) + def _get_decoder_type(self, model: MegatronParallel): + if self.export_config.decoder_type is not None: + return self.export_config.decoder_type + unwrapped_model = model + while not isinstance(unwrapped_model, llm.GPTModel): + unwrapped_model = unwrapped_model.module + + return get_modelopt_decoder_type(unwrapped_model) + + @staticmethod + def _generate_sample(model: MegatronParallel): + prompts = ["Born in north-east France, Soyer trained as a", "Born in California, Soyer trained as a"] + + mcore_tokenizer = MCoreTokenizerWrappper(model.tokenizer) + mcore_inference = model.get_inference_wrapper( + params_dtype=torch.bfloat16, inference_batch_times_seqlen_threshold=30 + ) + + generated = [r.generated_text for r in generate(mcore_inference, mcore_tokenizer, prompts)] + outputs = [prompt + generation for prompt, generation in zip(prompts, generated)] + + logging.info(f'Sample generation after PTQ (with prompts): {outputs}') - def quantize(self, model: llm.GPTModel, forward_loop=None): + def quantize(self, model: MegatronParallel, forward_loop=None): """Quantize the model and calibrate using given forward loop.""" if forward_loop is None: get_dataloader = create_data_iterator_getter( @@ -185,7 +185,7 @@ def quantize(self, model: llm.GPTModel, forward_loop=None): self._setup(model) unwrapped_model = get_unwrapped_mcore_model(model) - decoder_type = self._get_decoder_type(unwrapped_model.config) + decoder_type = self._get_decoder_type(model) quant_cfg = QUANT_CFG_CHOICES[algorithm] if "awq" in algorithm: weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"] @@ -230,6 +230,10 @@ def quantize(self, model: llm.GPTModel, forward_loop=None): if dist.get_rank() == 0: mtq.print_quant_summary(unwrapped_model) + if self.export_config.generate_sample: + logging.info("Generating a sample output after model quantization.") + self._generate_sample(model) + return model def create_megatron_forward_loop( @@ -266,21 +270,34 @@ def loop(model): return loop - def export(self, model: llm.GPTModel, model_dir: str) -> None: + @staticmethod + def _validate_quantized_checkpoint(checkpoint_dir: Path, tensor_parallelism_size: int) -> bool: + """Basic validation of the model structure.""" + + saved_config = (checkpoint_dir / 'config.json').exists() + saved_weights = True + for i in range(tensor_parallelism_size): + saved_weights &= (checkpoint_dir / f'rank{i}.safetensors').exists() + + export_successful = saved_config and saved_weights + if not export_successful: + logging.error("Failed to export the quantized model.") + return export_successful + + def export(self, model: MegatronParallel, model_dir: str) -> None: """Export model to a TensorRT-LLM checkpoint.""" - assert self.export_config is not None, "Export config is not set" - # TODO: Add sample generate - # TODO: Support megatron_amp_O2 export_dir = self.export_config.path + inference_tp = self.export_config.inference_tensor_parallel + inference_pp 
= self.export_config.inference_pipeline_parallel use_nfs_workspace = model.config.pipeline_model_parallel_size > 1 export_tensorrt_llm_checkpoint( model=get_unwrapped_mcore_model(model), - decoder_type=self._get_decoder_type(model.config), + decoder_type=self._get_decoder_type(model), dtype=self.torch_dtype, export_dir=export_dir, - inference_tensor_parallel=self.export_config.inference_tensor_parallel, - inference_pipeline_parallel=self.export_config.inference_pipeline_parallel, + inference_tensor_parallel=inference_tp, + inference_pipeline_parallel=inference_pp, use_nfs_workspace=use_nfs_workspace, ) dist.barrier() @@ -288,14 +305,13 @@ def export(self, model: llm.GPTModel, model_dir: str) -> None: # Save the model context in order to restore its tokenizer later. The destination # path is "nemo_context" as this name is used in nemo.export to setup tokenizer. if dist.get_rank() == 0: + assert self._validate_quantized_checkpoint(export_dir, inference_tp) shutil.copytree( - os.path.join(model_dir, CONTEXT_PATH), + ckpt_to_context_subdir(model_dir), os.path.join(export_dir, "nemo_context"), dirs_exist_ok=True, ) - logging.info("Model context saved.") - - logging.info(f"Export succeeded, model has been exported to {export_dir}.") + logging.info(f"Export succeeded, model has been exported to {export_dir}.") def get_calib_data_iter( @@ -323,7 +339,7 @@ def get_calib_data_iter( def create_data_iterator_getter(model, dataset, seq_len, batch_size, calibration_size): """Create a function that provides iterator over a given dataset.""" - def _iterator(): + def _get_iterator(): CHARACTERS_PER_TOKEN = 4 dataloader = get_calib_data_iter( @@ -332,14 +348,13 @@ def _iterator(): batch_size=batch_size, calib_size=calibration_size, ) + + data = [] for batch in dataloader: batch = [model.tokenizer.text_to_ids(text)[:seq_len] for text in batch] batch = [ids + (seq_len - len(ids)) * [model.tokenizer.eos] for ids in batch] - yield torch.tensor(batch, device=model.device) + data.append(torch.tensor(batch, device=model.device)) - def _iterator_getter(): - dataloader = _iterator() - dataloader = [data for data in dataloader] - return iter(tqdm(dataloader)) + return iter(tqdm(data)) - return _iterator_getter + return _get_iterator diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index bdfccb208d06..20739c872e80 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -23,8 +23,33 @@ from nemo.utils import logging +def get_modelopt_decoder_type(model: llm.GPTModel) -> str: + """Infers the modelopt decoder type from GPTModel subclass.""" + mapping = [ + (llm.Baichuan2Model, "baichuan"), + (llm.ChatGLMModel, "chatglm"), + (llm.Gemma2Model, "gemma2"), + (llm.GemmaModel, "gemma"), + (llm.LlamaModel, "llama"), + (llm.MistralModel, "llama"), + (llm.MixtralModel, "llama"), + (llm.NemotronModel, "gptnext"), + (llm.Qwen2Model, "qwen"), + (llm.StarcoderModel, "gptnext"), + (llm.Starcoder2Model, "gptnext"), + (llm.Phi3Model, "phi3"), + ] + + for config_class, decoder_type in mapping: + if isinstance(model, config_class): + return decoder_type + + logging.warning("Could not infer the decoder type") + return None + + def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: - """Modify model config for TensorRT Model Optimizer""" + """Modify model config for TensorRT-Model-Optimizer quantization""" from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import ( 
get_gpt_layer_modelopt_spec, @@ -46,7 +71,9 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: def load_with_modelopt_layer_spec( nemo_checkpoint_path: str, calib_tp: int = 1, calib_pp: int = 1, inference_only: bool = True ): - # TODO: setting ddp="pytorch" with manually deleting model.optim is a hackish way to disable DDP initialization. Needs a systematic solution. + """Loads a model from a NeMo 2.0 checkpoint using modelopt layer spec.""" + # TODO: setting ddp="pytorch" and deleting model.optim is a hackish way to disable DDP initialization. + # Needs a systematic solution. if inference_only: strategy = nl.MegatronStrategy( tensor_model_parallel_size=calib_tp, @@ -81,6 +108,7 @@ def load_with_modelopt_layer_spec( def get_unwrapped_mcore_model(model): + """Unwraps NeMo 2.0 to base MCore model.""" from megatron.core.models.gpt import GPTModel as MCoreGPTModel unwrapped_model = model diff --git a/scripts/llm/ptq.py b/scripts/llm/ptq.py index c04d32290e5f..2afe38c37b4d 100644 --- a/scripts/llm/ptq.py +++ b/scripts/llm/ptq.py @@ -17,6 +17,8 @@ def get_args(): + """Parses PTQ arguments""" + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="NeMo PTQ argument parser", @@ -58,6 +60,10 @@ def get_args(): type=str, help='Calibration dataset to be used. Should be \"wikitext\", \"cnn_dailymail\" or path to a local .json file', ) + parser.add_argument( + '--generate_sample', help='Generate sample model output after performing PTQ', action='store_true' + ) + parser.set_defaults(generate_sample=False) args = parser.parse_args() if args.output_path is None: @@ -68,6 +74,8 @@ def get_args(): def main(): + """Example NeMo 2.0 Post Training Quantization workflow""" + args = get_args() quantization_config = quantization.QuantizationConfig( @@ -87,6 +95,7 @@ def main(): inference_tensor_parallel=args.tensor_parallelism_size, inference_pipeline_parallel=args.pipeline_parallelism_size, dtype=args.dtype, + generate_sample=args.generate_sample, ) quantizer = quantization.Quantizer(quantization_config, export_config)
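With the new flag plumbed through scripts/llm/ptq.py, the pieces shown in this patch also compose into a short programmatic flow. The sketch below is illustrative rather than verbatim script code: the checkpoint path is a placeholder, QuantizationConfig is constructed with defaults because its fields are not shown in these hunks, and the "bf16" dtype value is assumed to be a member of SUPPORTED_DTYPE; the function and method signatures themselves follow the hunks above.

# Hedged sketch of the NeMo 2.0 PTQ flow exercised by scripts/llm/ptq.py.
from nemo.collections.llm import quantization
from nemo.collections.llm.quantization.utils import load_with_modelopt_layer_spec

nemo_checkpoint = "/checkpoints/llama3-8b-nemo2"  # hypothetical NeMo 2.0 checkpoint directory

quantization_config = quantization.QuantizationConfig()  # defaults; fields are not shown in these hunks
export_config = quantization.ExportConfig(
    path="/results/llama3-8b-trtllm",  # TensorRT-LLM export directory (placeholder)
    dtype="bf16",                      # assumed valid export dtype
    inference_tensor_parallel=1,
    inference_pipeline_parallel=1,
    generate_sample=True,              # new option from this patch: print a sample generation after PTQ
)

# Load the checkpoint with the modelopt layer spec so the model can be quantized (see utils.py above).
model = load_with_modelopt_layer_spec(nemo_checkpoint, calib_tp=1, calib_pp=1, inference_only=True)

quantizer = quantization.Quantizer(quantization_config, export_config)
model = quantizer.quantize(model)         # calibrate and quantize; emits a sample because generate_sample=True
quantizer.export(model, nemo_checkpoint)  # write the TRT-LLM checkpoint to export_config.path and copy the NeMo context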