From a153b8c58d56aa930749587017ee70d56f75445e Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Fri, 22 Nov 2024 13:04:32 -0500 Subject: [PATCH 01/11] Fix transcribe speech (#11379) * add explicit bracket for or operation Signed-off-by: Nithin Rao Koluguri * add cfg arg Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- examples/asr/transcribe_speech.py | 3 +++ nemo/collections/asr/models/ctc_models.py | 1 + nemo/collections/asr/models/rnnt_models.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index f1d61edc990e..5c4a636e8b1c 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -276,6 +276,9 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis # we will adjust this flag if the model does not support it compute_langs = cfg.compute_langs + if cfg.timestamps: + cfg.return_hypotheses = True + # Check whether model and decoder type match if isinstance(asr_model, EncDecCTCModel): if cfg.decoder_type and cfg.decoder_type != 'ctc': diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 3df6a7352c4d..76dcd13cca50 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -160,6 +160,7 @@ def transcribe( A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files """ + timestamps = timestamps or (override_config.timestamps if override_config is not None else None) if timestamps is not None: # else retain the decoder state (users can set it using change_decoding_strategy) if timestamps or (override_config is not None and override_config.timestamps): diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index a6408b5e935e..e4d1abd0b50c 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -285,7 +285,7 @@ def transcribe( * A list of greedy transcript texts / Hypothesis * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. 
""" - + timestamps = timestamps or (override_config.timestamps if override_config is not None else None) if timestamps is not None: if timestamps or (override_config is not None and override_config.timestamps): logging.info( From d033737c96c7639f2e3912a0f6fea65be2267688 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Fri, 22 Nov 2024 14:10:10 -0500 Subject: [PATCH 02/11] Add llama 3.2 1b and 3b (#11335) * add llama 3.2 1b and 3b Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * add recipe to init Signed-off-by: Chen Cui * fix path Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- nemo/collections/llm/__init__.py | 4 + nemo/collections/llm/gpt/model/__init__.py | 4 + nemo/collections/llm/gpt/model/llama.py | 39 ++- nemo/collections/llm/recipes/__init__.py | 4 + nemo/collections/llm/recipes/llama32_1b.py | 270 +++++++++++++++++++++ nemo/collections/llm/recipes/llama32_3b.py | 270 +++++++++++++++++++++ 6 files changed, 588 insertions(+), 3 deletions(-) create mode 100644 nemo/collections/llm/recipes/llama32_1b.py create mode 100644 nemo/collections/llm/recipes/llama32_3b.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index c5c2e007bc1e..c36da39b43c7 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -73,6 +73,8 @@ Llama31Config8B, Llama31Config70B, Llama31Config405B, + Llama32Config1B, + Llama32Config3B, LlamaConfig, LlamaModel, MaskedTokenLossReduction, @@ -171,6 +173,8 @@ "Llama31Config8B", "Llama31Config70B", "Llama31Config405B", + "Llama32Config1B", + "Llama32Config3B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 152309536f5b..9f186ebba90f 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -59,6 +59,8 @@ Llama31Config8B, Llama31Config70B, Llama31Config405B, + Llama32Config1B, + Llama32Config3B, LlamaConfig, LlamaModel, ) @@ -134,6 +136,8 @@ "Llama31Config8B", "Llama31Config70B", "Llama31Config405B", + "Llama32Config1B", + "Llama32Config3B", "NemotronConfig", "Nemotron3Config4B", "Nemotron3Config8B", diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index f5f6de6c79e7..a7e995addb83 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -14,6 +14,7 @@ import math from dataclasses import dataclass +from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional @@ -86,7 +87,7 @@ class Llama2Config70B(LlamaConfig): @dataclass -class Llama3Config(GPTConfig): +class Llama3Config(LlamaConfig): num_query_groups: int = 8 hidden_dropout: float = 0.0 attention_dropout: float = 0.0 @@ -182,6 +183,32 @@ class Llama31Config405B(Llama31Config): make_vocab_size_divisible_by: int = 128 +@dataclass +class Llama32Config1B(Llama31Config): + scale_factor: int = 32 + share_embeddings_and_output_weights: bool = True + rotary_base: int = 500_000 + num_layers: int = 16 + hidden_size: int = 2048 + ffn_hidden_size: int = 8192 + num_attention_heads: int = 32 + num_query_groups: int = 8 + make_vocab_size_divisible_by: int = 128 + + +@dataclass +class Llama32Config3B(Llama31Config): + scale_factor: int = 32 + share_embeddings_and_output_weights: bool = True + rotary_base: int = 500_000 + num_layers: int = 28 + hidden_size: 
int = 3072 + ffn_hidden_size: int = 8192 + num_attention_heads: int = 24 + num_query_groups: int = 8 + make_vocab_size_divisible_by: int = 128 + + @dataclass class CodeLlamaConfig7B(Llama2Config7B): rotary_base: int = 1_000_000 @@ -252,6 +279,9 @@ def convert_state(self, source, target): "model.norm.weight": "decoder.final_layernorm.weight", "lm_head.weight": "output_layer.weight", } + if getattr(source.config, "tie_word_embeddings", False): + # llama 3.2 1B and 3B models have no shared input output embeddings + del mapping["lm_head.weight"] return io.apply_transforms(source, target, mapping=mapping, transforms=[_import_qkv, _import_linear_fc1]) @@ -275,7 +305,7 @@ def make_vocab_size_divisible_by(vocab_size): if getattr(source, 'rope_scaling', None) is not None and source.rope_scaling.get('rope_type') == 'llama3': # Apply Llama3.1 customize rope scaling - cls = Llama31Config + cls = partial(Llama31Config, scale_factor=source.rope_scaling.get("factor", 8.0)) else: cls = LlamaConfig output = cls( @@ -289,7 +319,7 @@ def make_vocab_size_divisible_by(vocab_size): rotary_base=source.rope_theta, gated_linear_unit=True, make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), - share_embeddings_and_output_weights=False, + share_embeddings_and_output_weights=getattr(source, "tie_word_embeddings", False), fp16=(dtype_from_hf(source) == torch.float16), bf16=(dtype_from_hf(source) == torch.bfloat16), params_dtype=dtype_from_hf(source), @@ -355,6 +385,7 @@ def config(self) -> "HFLlamaConfig": num_key_value_heads=source.num_query_groups, rope_theta=source.rotary_base, vocab_size=self.tokenizer.vocab_size, + tie_word_embeddings=source.share_embeddings_and_output_weights, ) @@ -509,6 +540,8 @@ def apply_rope_scaling( "Llama31Config8B", "Llama31Config70B", "Llama31Config405B", + "Llama32Config1B", + "Llama32Config3B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 449592298d41..1db88f633e89 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -33,6 +33,8 @@ llama31_8b, llama31_70b, llama31_405b, + llama32_1b, + llama32_3b, mamba2_1_3b, mamba2_2_7b, mamba2_8b, @@ -89,6 +91,8 @@ "llama31_8b", "llama31_70b", "llama31_405b", + "llama32_1b", + "llama32_3b", "mamba2_130m", "mamba2_370m", "mamba2_780m", diff --git a/nemo/collections/llm/recipes/llama32_1b.py b/nemo/collections/llm/recipes/llama32_1b.py new file mode 100644 index 000000000000..32675adf3686 --- /dev/null +++ b/nemo/collections/llm/recipes/llama32_1b.py @@ -0,0 +1,270 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Callable, Optional + +import lightning.pytorch as pl +import nemo_run as run +import torch +from lightning.pytorch.callbacks.callback import Callback +from megatron.core.distributed import DistributedDataParallelConfig + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs +from nemo.collections.llm.gpt.model.llama import Llama32Config1B, LlamaModel +from nemo.collections.llm.peft import PEFT_STR2CLS +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.utils.exp_manager import TimingCallback + +NAME = "llama32_1b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3.2 1B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3.2 1B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama32_1b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + conf = run.Config(Llama32Config1B) + conf.seq_length = 8192 + return run.Config(LlamaModel, config=conf) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3.2 1B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama32_1b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3.2 1B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama32_1b + $ nemo llm pretrain --factory "llama32_1b(num_nodes=1, name='my_1b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama32_1b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + This recipe is optimized for the large 8B model and requires significant computational resources. + """ + recipe = run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', + seq_length: Optional[int] = None, + packed_sequence: Optional[bool] = None, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3.2 1B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. 
+ peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + seq_length (int): Maximum number of tokens per microbatch. + packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given + maximum seq_length for better efficiency. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama32_1b + + Python API usage: + >>> recipe = finetune_recipe(name="llama32_1b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + + # For unpacked sequence, most samples in SQuAD dataset are shorter than 2K + if seq_length is None: + seq_length = 4096 if packed_sequence else 2048 + + recipe = default_finetune_recipe( + model(), "meta-llama/Llama-3.2-1B", dir, name, num_nodes, num_gpus_per_node, packed_sequence + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) + recipe.peft.dim = 8 + recipe.peft.alpha = 16 + recipe.optim.config.use_distributed_optimizer = False + + # some settings currently do not function correctly with LoRA + recipe.model.config.cross_entropy_loss_fusion = False + + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + + # Sequence length settings in the model and dataset must agree + recipe.model.config.seq_length = seq_length + recipe.data.seq_length = seq_length + if packed_sequence: + recipe.data.dataset_kwargs = {'pad_to_max_length': True} + recipe.data.packed_sequence_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length) + + return recipe diff --git a/nemo/collections/llm/recipes/llama32_3b.py b/nemo/collections/llm/recipes/llama32_3b.py new file mode 100644 index 000000000000..d78ea0b50983 --- /dev/null +++ b/nemo/collections/llm/recipes/llama32_3b.py @@ -0,0 +1,270 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Callable, Optional + +import lightning.pytorch as pl +import nemo_run as run +import torch +from lightning.pytorch.callbacks.callback import Callback +from megatron.core.distributed import DistributedDataParallelConfig + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs +from nemo.collections.llm.gpt.model.llama import Llama32Config3B, LlamaModel +from nemo.collections.llm.peft import PEFT_STR2CLS +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.utils.exp_manager import TimingCallback + +NAME = "llama32_3b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a Llama3.2 3B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the Llama3.2 3B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=llama32_3b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + conf = run.Config(Llama32Config3B) + conf.seq_length = 8192 + return run.Config(LlamaModel, config=conf) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 1, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for Llama3.2 3B model. + + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=llama32_3b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1) + >>> print(trainer_config) + + Note: + This configuration uses extensive parallelism to handle the large model size efficiently. 
+ """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=pretrain, name=NAME) +def pretrain_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + fn: Callable = pretrain, +) -> run.Partial: + """ + Create a pre-training recipe for Llama3.2 3B model. + + This function sets up a complete configuration for pre-training, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the pre-training run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + fn (Callable): The pre-training function to use. + + Returns: + run.Partial: Partial configuration for pre-training. + + Examples: + CLI usage: + $ nemo llm pretrain --factory llama32_3b + $ nemo llm pretrain --factory "llama32_3b(num_nodes=1, name='my_3b_pretrain')" + + Python API usage: + >>> recipe = pretrain_recipe(name="llama32_3b_pretrain", num_nodes=1) + >>> print(recipe) + + Note: + This recipe is optimized for the large 8B model and requires significant computational resources. + """ + recipe = run.Partial( + fn, + model=model(), + trainer=trainer( + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + callbacks=[run.Config(TimingCallback)], + ), + data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), + log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), + optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), + resume=default_resume(), + ) + + return recipe + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + peft_scheme: Optional[str] = 'lora', + seq_length: Optional[int] = None, + packed_sequence: Optional[bool] = None, +) -> run.Partial: + """ + Create a fine-tuning recipe for Llama3.2 3B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. 
+ peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + seq_length (int): Maximum number of tokens per microbatch. + packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given + maximum seq_length for better efficiency. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory llama32_3b + + Python API usage: + >>> recipe = finetune_recipe(name="llama32_3b_finetune", num_nodes=1) + >>> print(recipe) + + Note: + This recipe uses the SQuAD dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + + # For unpacked sequence, most samples in SQuAD dataset are shorter than 2K + if seq_length is None: + seq_length = 4096 if packed_sequence else 2048 + + recipe = default_finetune_recipe( + model(), "meta-llama/Llama-3.2-3B", dir, name, num_nodes, num_gpus_per_node, packed_sequence + ) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) + recipe.peft.dim = 8 + recipe.peft.alpha = 16 + recipe.optim.config.use_distributed_optimizer = False + + # some settings currently do not function correctly with LoRA + recipe.model.config.cross_entropy_loss_fusion = False + + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + + # Sequence length settings in the model and dataset must agree + recipe.model.config.seq_length = seq_length + recipe.data.seq_length = seq_length + if packed_sequence: + recipe.data.dataset_kwargs = {'pad_to_max_length': True} + recipe.data.packed_sequence_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length) + + return recipe From ba7a68255bb2be0d449f7c63ed43178f78e188fd Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 22 Nov 2024 21:22:25 +0200 Subject: [PATCH 03/11] mlm conversion & tiktokenizer support (#11349) * mlm conversion fix Signed-off-by: dimapihtar * add tiktoken support for nemotron -> hf Signed-off-by: dimapihtar * additional params Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * add ci test for mlm conversion Signed-off-by: dimapihtar * add ci test for mlm ckpt conversion Signed-off-by: dimapihtar * remove extra if statement Signed-off-by: dimapihtar * fix typo Signed-off-by: dimapihtar * fix if statement Signed-off-by: dimapihtar * fix paths Signed-off-by: dimapihtar * update paths Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: dimapihtar --- .github/workflows/cicd-main.yml | 121 ++++++++++++++++++ .../megatron_ckpt_to_nemo.py | 9 +- nemo/collections/nlp/models/nlp_model.py | 17 ++- .../convert_nemotron_nemo_to_hf.py | 38 +++++- 4 files changed, 182 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 49c6c55ca778..b82bbc65cfc1 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2109,6 +2109,126 @@ jobs: # } # } + L2_Megatron_LM_To_NeMo_Conversion: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_LM_To_NeMo_Conversion') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=1 Megatron-LM/pretrain_gpt.py \ + --mock-data \ + --distributed-timeout-minutes 60 \ + --use-mcore-models \ + --no-mmap-bin-files \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --train-samples 80 \ + --init-method-std 0.014 \ + --position-embedding-type rope \ + --rotary-base 1000000 \ + --rotary-percent 1.0 \ + --squared-relu \ + --num-layers 4 \ + --hidden-size 384 \ + --num-attention-heads 8 \ + --group-query-attention \ + --num-query-groups 8 \ + --ffn-hidden-size 1536 \ + --kv-channels 128 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --exit-duration-in-mins 5750 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --micro-batch-size 1 \ + --global-batch-size 8 \ + --lr 6e-4 \ + --min-lr 6e-6 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --lr-decay-style cosine \ + --log-interval 1 \ + --eval-iters 1 \ + --eval-interval 10 \ + --tokenizer-type GPT2BPETokenizer \ + --tokenizer-model /home/TestData/nlp/gpt2_tokenizer \ + --vocab-file /home/TestData/nlp/gpt2_tokenizer/vocab.json \ + --merge-file /home/TestData/nlp/gpt2_tokenizer/merges.txt \ + --save /tmp/mlm_conversion_ckpt \ + --save-interval 10 \ + --ckpt-format torch_dist \ + --ckpt-fully-parallel-save \ + --ckpt-fully-parallel-load \ + --async-save \ + --ckpt-assume-constant-structure \ + --timing-log-option minmax \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-throughput \ + --bf16 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + --overlap-param-gather \ + --manual-gc \ + --num-workers 2 + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + model.data.data_impl=mock \ + model.data.data_prefix=[] \ + model.skip_train=True \ + model.transformer_engine=True \ + model.use_flash_attention=False \ + model.normalization=rmsnorm \ + model.num_layers=4 \ + model.hidden_size=384 \ + model.ffn_hidden_size=1536 \ + model.num_attention_heads=8 \ + model.num_query_groups=8 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=True \ + model.masked_softmax_fusion=True \ + model.encoder_seq_length=8192 \ + model.max_position_embeddings=8192 \ + model.data.seq_length=8192 \ + model.activation=squared-relu \ + model.transformer_block_type=True \ + model.micro_batch_size=1 \ + model.global_batch_size=8 \ + ++model.rotary_base=1000000 \ + model.rotary_percentage=1.0 \ + model.apply_query_key_layer_scaling=False \ + ++model.group_query_attention=True \ + model.apply_rope_fusion=True \ + model.kv_channels=128 \ + ++model.bert_binary_head=True \ + ++model.position_embedding_type=rope \ + ++model.add_position_embedding=True \ + trainer.limit_val_batches=1 \ + exp_manager.exp_dir=/tmp/nemo_conversion_ckpt + + python -m torch.distributed.launch --nproc_per_node=1 examples/nlp/language_modeling/megatron_ckpt_to_nemo.py \ + --checkpoint_folder /tmp/mlm_conversion_ckpt \ + --checkpoint_name iter_0000010 \ + --nemo_file_path /tmp/mlm_to_nemo_test.nemo \ + --tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 1 \ + --gpus_per_node 1 \ + --model_type gpt \ + --hparams_file 
/tmp/nemo_conversion_ckpt/megatron_gpt/version_0/hparams.yaml \ + --convert_mlm + + AFTER_SCRIPT: | + rm -rf /tmp/nemo_conversion_ckpt + rm -rf /tmp/mlm_conversion_ckpt + rm -rf /tmp/mlm_to_nemo_test.nemo + L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4432,6 +4552,7 @@ jobs: - L2_RAG_Pipeline_Generating - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_Skip_Train + - L2_Megatron_LM_To_NeMo_Conversion - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2 diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index b46f8f459ff0..4b9fab987dc7 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -112,6 +112,11 @@ def get_args(): choices=['32-true', '16-mixed', 'bf16-mixed'], help="Precision value for the trainer that matches with precision of the ckpt", ) + parser.add_argument( + "--convert_mlm", + action="store_true", + help="Use this flag to convert megatron-lm checkpoints.", + ) args = parser.parse_args() return args @@ -195,7 +200,9 @@ def convert(local_rank, rank, world_size, args): ) if args.model_type == 'gpt': - model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer) + model = MegatronGPTModel.load_from_checkpoint( + checkpoint_path, hparams_file=args.hparams_file, trainer=trainer, load_mlm=args.convert_mlm + ) elif args.model_type == 'sft': model = MegatronGPTSFTModel.load_from_checkpoint( checkpoint_path, hparams_file=args.hparams_file, trainer=trainer diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py index 0c61b085bc7f..6a87eb28723c 100644 --- a/nemo/collections/nlp/models/nlp_model.py +++ b/nemo/collections/nlp/models/nlp_model.py @@ -397,7 +397,22 @@ def dummy(): model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) model.trainer.strategy.setup_environment() sharded_state_dict = model.sharded_state_dict() - checkpoint['state_dict'] = sharded_state_dict + if kwargs.get("load_mlm", False): + mlm_sharded_state_dict = {} + for k, v in sharded_state_dict.items(): + # Remove 'model.' 
from the sharded_state_dict keys + new_key = k.replace('model.', '', 1) + + # Update the key attribute of the ShardedTensor value + new_value = v + if hasattr(v, 'key'): + new_value.key = v.key.replace('model.', '', 1) + + # Add the updated key-value pair to the new dictionary + mlm_sharded_state_dict[new_key] = new_value + checkpoint['state_dict'] = mlm_sharded_state_dict + else: + checkpoint['state_dict'] = sharded_state_dict # load the checkpoint from disk checkpoint = dist_checkpointing.load(sharded_state_dict=checkpoint, checkpoint_dir=checkpoint_dir) # restore the weights diff --git a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py index 392e3628ccdb..2f66773f8724 100644 --- a/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py @@ -21,7 +21,7 @@ import torch from lightning.pytorch import Trainer from transformers import LlamaTokenizer, PreTrainedTokenizerFast -from transformers.convert_slow_tokenizer import LlamaConverter +from transformers.convert_slow_tokenizer import LlamaConverter, TikTokenConverter from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -130,6 +130,20 @@ def convert_hf_config(nemo_config, tokenizer, vocab_size, dtype, hf_output_path, json.dump(hf_config, open(f"{hf_output_path}/config.json", "w"), indent=2) +def convert_tiktoken(vocab_file) -> None: + with open(vocab_file, 'r') as f: + vocab = json.load(f) + os.remove(vocab_file) + + lines = [] + for line in vocab: + lines.append(f"{line['token_bytes']} {line['rank']}") + + for line in lines: + with open(vocab_file, 'a') as f: + f.write(line + '\n') + + def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> None: """ Convert NeMo weights to HF weights @@ -323,6 +337,28 @@ def extract_nemotron_tokenizer(nemo_file, model_config, output_hf_path, nemo_tok ) tokenizer.save_pretrained(output_hf_path) logging.info(f"Setencepiece tokenizer has been saved to {output_tokenizer}") + elif tokenizer_cfg.library == "tiktoken": + tokenizer_fn = tokenizer_cfg.model[5:] + special_tokens = ["", "", ""] + import tarfile + + archive = tarfile.open(nemo_file, "r") + tokenizer_filename = "./" + tokenizer_fn # exclude 'nemo:' prefix + archive.extract(tokenizer_filename, output_hf_path) + archive.close() + vocab_file = os.path.join(output_hf_path, tokenizer_fn) + convert_tiktoken(vocab_file) + converted_tokenizer = TikTokenConverter( + vocab_file=vocab_file, additional_special_tokens=special_tokens + ).converted() + os.remove(vocab_file) + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=converted_tokenizer, + model_input_names=["input_ids", "attention_mask"], + bos_token="", + eos_token="", + ) + tokenizer.save_pretrained(output_hf_path) elif isinstance(nemo_tokenizer, AutoTokenizer): nemo_tokenizer.tokenizer.save_pretrained(output_hf_path) logging.info(f"HF AutoTokenizer has been saved to {output_hf_path}") From 7ec58fab6fe990efb6abf18b68bc2eceffbbd457 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 22 Nov 2024 11:25:39 -0800 Subject: [PATCH 04/11] nit: remove non-strictly needed lines --- .github/workflows/cicd-main.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b82bbc65cfc1..a4b2baa59550 100644 --- a/.github/workflows/cicd-main.yml +++ 
b/.github/workflows/cicd-main.yml @@ -2224,11 +2224,6 @@ jobs: --hparams_file /tmp/nemo_conversion_ckpt/megatron_gpt/version_0/hparams.yaml \ --convert_mlm - AFTER_SCRIPT: | - rm -rf /tmp/nemo_conversion_ckpt - rm -rf /tmp/mlm_conversion_ckpt - rm -rf /tmp/mlm_to_nemo_test.nemo - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml From 9d80f84bc101282046707d55ed2b1ef490f31a80 Mon Sep 17 00:00:00 2001 From: Huiying Date: Fri, 22 Nov 2024 15:12:47 -0800 Subject: [PATCH 05/11] add metric calc (#11381) Signed-off-by: HuiyingLi --- .../llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 25 +++++++++++++++++++ .../llama-3/nemo2-sft-peft/nemo2-sft.ipynb | 25 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index cd3bae1cc627..aa463e2b84be 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -499,6 +499,31 @@ "```" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5. Calculate Evaluation Metrics\n", + "\n", + "We can evaluate the model's predictions by calculating the Exact Match (EM) and F1 scores.\n", + "- Exact Match is a binary measure (0 or 1) checking if the model outputs match one of the\n", + "ground truth answer exactly.\n", + "- F1 score is the harmonic mean of precision and recall for the answer words.\n", + "\n", + "Below is a script that computes these metrics. The sample scores can be improved by training the model further and performing hyperparameter tuning. In this notebook, we only train for 20 steps.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!python /opt/NeMo/scripts/metric_calculation/peft_metric_calc.py --pred_file peft_prediction.jsonl --label_field \"original_answers\" --pred_field \"prediction\"" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb index 479d81928e98..e84ff916fc4e 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb @@ -606,6 +606,31 @@ "{\"input\": \"Muckle Water is a long, narrow fresh water loch on Ward Hill on Rousay, Orkney, Scotland. It is the biggest loch on the island and is popular for fishing. It can be reached by a track from the roadside. The Suso Burn on the north eastern shore drains the loch into the Sound of Rousay.\\n\\nWhere is Muckle Water?\", \"category\": \"closed_qa\", \"label\": \"Muckle water is located in Rousay, Orkney, Scotland.\", \"prediction\": \" Muckle Water is a long, narrow fresh water loch on Ward Hill on Rousay,\"}\n", "```" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5. Calculate Evaluation Metrics\n", + "\n", + "We can evaluate the model's predictions by calculating the Exact Match (EM) and F1 scores.\n", + "- Exact Match is a binary measure (0 or 1) checking if the model outputs match one of the\n", + "ground truth answer exactly.\n", + "- F1 score is the harmonic mean of precision and recall for the answer words.\n", + "\n", + "Below is a script that computes these metrics. The sample scores can be improved by training the model further and performing hyperparameter tuning. 
In this notebook, we only train for 20 steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!python /opt/NeMo/scripts/metric_calculation/peft_metric_calc.py --pred_file sft_prediction.jsonl --label_field \"label\" --pred_field \"prediction\"" + ] } ], "metadata": { From e83d3eaa968120cd22112215510af0c7fd7ccc90 Mon Sep 17 00:00:00 2001 From: Michal Futrega Date: Sat, 23 Nov 2024 22:08:00 +0100 Subject: [PATCH 06/11] Enable packed dataset for validation; add a2a_experimental argument (#11378) * Enable packed dataset for validation; add a2a_experimental argument * Apply isort and black reformatting Signed-off-by: michal2409 --------- Signed-off-by: michal2409 Co-authored-by: michal2409 --- nemo/collections/llm/gpt/data/fine_tuning.py | 49 ++++++++++++++----- .../llm/gpt/data/packed_sequence.py | 30 +++++++++--- nemo/collections/llm/peft/lora.py | 3 ++ 3 files changed, 63 insertions(+), 19 deletions(-) diff --git a/nemo/collections/llm/gpt/data/fine_tuning.py b/nemo/collections/llm/gpt/data/fine_tuning.py index 8fcef72f3bd9..0d866bb600fe 100644 --- a/nemo/collections/llm/gpt/data/fine_tuning.py +++ b/nemo/collections/llm/gpt/data/fine_tuning.py @@ -117,17 +117,28 @@ def prepare_data(self) -> None: """ Prepare packed sequence data """ - if self.packed_sequence_size > 0 and not self.train_path_packed.is_file(): + if self.packed_sequence_size > 0: from nemo.collections.llm.gpt.data.packed_sequence import prepare_packed_sequence_data - prepare_packed_sequence_data( - input_path=self.train_path, - output_path=self.train_path_packed, - packed_sequence_size=self.packed_sequence_size, - tokenizer=self.tokenizer, - max_seq_length=self.seq_length, - seed=self.seed, - ) + if not self.train_path_packed.is_file(): + prepare_packed_sequence_data( + input_path=self.train_path, + output_path=self.train_path_packed, + packed_sequence_size=self.packed_sequence_size, + tokenizer=self.tokenizer, + max_seq_length=self.seq_length, + seed=self.seed, + ) + + if not self.validation_path_packed.is_file(): + prepare_packed_sequence_data( + input_path=self.validation_path, + output_path=self.validation_path_packed, + packed_sequence_size=self.packed_sequence_size, + tokenizer=self.tokenizer, + max_seq_length=self.seq_length, + seed=self.seed, + ) def setup(self, stage: str): """Called by pytorch lightning in datamodule setup""" @@ -195,7 +206,7 @@ def val_dataloader(self) -> DataLoader: # pylint: disable=C0115,C0116 return self._create_dataloader( self._create_dataset( - self.validation_path, + self.validation_path if self.packed_sequence_size <= 0 else self.validation_path_packed, is_test=True, **self.dataset_kwargs, ), @@ -249,8 +260,8 @@ def train_path_packed(self) -> Path: """Path to training dataset file for packed sequence. 
The file path contains a reference to the tokenizer/model name since packed sequence dataset consists of tokenized indices.""" if self.packed_sequence_size > 0: - if self.packed_sequence_specs.packed_data_path is not None: - return self.packed_sequence_specs.packed_data_path + if self.packed_sequence_specs.packed_train_data_path is not None: + return self.packed_sequence_specs.packed_train_data_path tokenizer_model_name = self._extract_tokenizer_model_name() folder_name = self.dataset_root / "packed" / tokenizer_model_name folder_name.mkdir(parents=True, exist_ok=True) @@ -258,6 +269,20 @@ def train_path_packed(self) -> Path: else: raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.") + @property + def validation_path_packed(self) -> Path: + """Path to validation dataset file for packed sequence. The file path contains a reference to the + tokenizer/model name since packed sequence dataset consists of tokenized indices.""" + if self.packed_sequence_size > 0: + if self.packed_sequence_specs.packed_val_data_path is not None: + return self.packed_sequence_specs.packed_val_data_path + tokenizer_model_name = self._extract_tokenizer_model_name() + folder_name = self.dataset_root / "packed" / tokenizer_model_name + folder_name.mkdir(parents=True, exist_ok=True) + return folder_name / f"validation_{self.packed_sequence_size}.npy" + else: + raise ValueError("`validation_path_packed` invalid since packed sequence size is not specified.") + @property def validation_path(self) -> Path: """Path to validation dataset file""" diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py index 153e79f94391..345489ea0b63 100644 --- a/nemo/collections/llm/gpt/data/packed_sequence.py +++ b/nemo/collections/llm/gpt/data/packed_sequence.py @@ -101,15 +101,31 @@ class PackedSequenceSpecs: This field is set by llm.finetune api. """ - packed_data_path: str = None + packed_train_data_path: str = None """ - If specified, use the packed dataset from this file instead of the default path. + If specified, use this file for the packed training dataset instead of the default path. + """ + + packed_val_data_path: str = None + """ + If specified, use this file for the packed validation dataset instead of the default path. 
""" def __post_init__(self): - if self.packed_data_path is not None: - self.packed_data_path = Path(self.packed_data_path) + if self.packed_train_data_path is not None: + self.packed_train_data_path = Path(self.packed_train_data_path) + assert ( + self.packed_train_data_path.suffix == ".npy" + ), f"packed training data file must be a .npy file: {self.packed_train_data_path}" + assert ( + self.packed_train_data_path.exists() + ), f"packed training data file does not exist: {self.packed_train_data_path}" + + if self.packed_val_data_path is not None: + self.packed_val_data_path = Path(self.packed_val_data_path) + assert ( + self.packed_val_data_path.suffix == ".npy" + ), f"packed validation data file must be a .npy file: {self.packed_val_data_path}" assert ( - self.packed_data_path.suffix == ".npy" - ), f"packed data file must be a .npy file: {self.packed_data_path}" - assert self.packed_data_path.exists(), f"packed data file does not exist: {self.packed_data_path}" + self.packed_val_data_path.exists() + ), f"packed validation data file does not exist: {self.packed_val_data_path}" diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 57cdda3a2871..205cde071fa7 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -124,6 +124,7 @@ class LoRA(PEFT): dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0. dropout_position (Literal['pre', 'post'], optional): Position for applying dropout. Can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'post'. + a2a_experimental (bool): Enables the experimental All-to-All (A2A) communication strategy. Defaults to False. Example: -------- @@ -151,6 +152,7 @@ class LoRA(PEFT): dropout_position: Literal['pre', 'post'] = 'post' lora_A_init_method: str = "xavier" lora_B_init_method: str = "zero" + a2a_experimental: bool = False def transform(self, m: nn.Module, name=None, prefix=None): """ @@ -224,6 +226,7 @@ def wildcard_match(pattern, key): model_parallel_config=getattr(m, "config", None), alpha=self.alpha, is_expert=is_expert_linear(full_name), + a2a_experimental=self.a2a_experimental, ) return AdapterParallelAdd(m, adapter) return m From 3afcde032355efa735f670db72f7efa361ab26dc Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:27:47 -0500 Subject: [PATCH 07/11] Fix DDP unused param error when TE is enabled in NeMo Lite (#11364) * Fix DDP unused param error when TE is enabled Signed-off-by: Onur Yilmaz * Added partial function for te Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia --- examples/llm/sft/hf.py | 23 ++++++++++--------- .../gpt/model/hf_auto_model_for_causal_lm.py | 6 +++++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py index 59b8b4ad3491..1d282312b130 100755 --- a/examples/llm/sft/hf.py +++ b/examples/llm/sft/hf.py @@ -19,7 +19,7 @@ from nemo import lightning as nl from nemo.collections import llm -from nemo.lightning.pytorch.accelerate.transformer_engine import is_te_accelerated, te_accelerate +from nemo.lightning.pytorch.accelerate.transformer_engine import is_te_accelerated from nemo.lightning.pytorch.callbacks import ModelCallback @@ -75,16 +75,17 @@ def squad(tokenizer) -> pl.LightningDataModule: grad_clip = None use_dist_samp = False - model = 
llm.HfAutoModelForCausalLM(args.model) - tokenizer = model.tokenizer + model_accelerator = None + if args.model_accelerator == "te": + from functools import partial + from nemo.lightning.pytorch.accelerate.transformer_engine import te_accelerate - callbacks = [] - if args.model_accelerator: - if args.model_accelerator == "te": - model_transform = ModelCallback( - on_train_start=lambda model: te_accelerate(model, fp8_autocast=args.fp8_autocast) - ) - callbacks.append(model_transform) + model_accelerator = partial(te_accelerate, fp8_autocast=args.fp8_autocast) + + from nemo.lightning.pytorch.accelerate.transformer_engine import te_accelerate + + model = llm.HfAutoModelForCausalLM(model_name=args.model, model_accelerator=model_accelerator) + tokenizer = model.tokenizer llm.api.finetune( model=model, @@ -100,7 +101,7 @@ def squad(tokenizer) -> pl.LightningDataModule: accumulate_grad_batches=10, gradient_clip_val=grad_clip, use_distributed_sampler=use_dist_samp, - callbacks=callbacks, + callbacks=[], logger=wandb, ), optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)), diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index c0f02d706ceb..26e4604adc43 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -39,6 +39,7 @@ def __init__( tokenizer=None, loss_fn=masked_cross_entropy, model_transform=None, + model_accelerator=None, trust_remote_code=False, ): super().__init__() @@ -50,6 +51,7 @@ def __init__( self.load_pretrained_weights = load_pretrained_weights self.is_hf_model = True self.model_transform = model_transform + self.model_accelerator = model_accelerator self.trust_remote_code = trust_remote_code @property @@ -78,6 +80,10 @@ def configure_model(self): config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code) self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=self.trust_remote_code) + + if self.model_accelerator is not None: + self.model_accelerator(self.model) + self.model.train() def forward(self, input_ids, attention_mask=None, labels=None, loss_mask=None): From 5094b2e53836adf0a50d455c70513c774cf6523a Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:42:57 -0800 Subject: [PATCH 08/11] Update llama32 vision (mllama) use attention bias (#11316) * update recipe Signed-off-by: yaoyu-33 * fix mllama mock ds Signed-off-by: yaoyu-33 * update to use attention bias Signed-off-by: yaoyu-33 * remove example Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring mock.py Signed-off-by: yaoyu-33 * fix docstring language.py Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring language.py Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring mllama/base.py Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix docstring mllama/language.py Signed-off-by: yaoyu-33 * bump mcore Signed-off-by: Oliver Koenig * Add scripts for mllama Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * update script Signed-off-by: yaoyu-33 * fix pylint Signed-off-by: yaoyu-33 * revert Dockerfile.ci Signed-off-by: Yu Yao 
<54727607+yaoyu-33@users.noreply.github.com> * update script match recipe Signed-off-by: yaoyu-33 * update recipes Signed-off-by: yaoyu-33 * update mllama 90b recipe Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: Oliver Koenig Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: yaoyu-33 Co-authored-by: Oliver Koenig --- nemo/collections/vlm/mllama/data/mock.py | 40 ++++ nemo/collections/vlm/mllama/model/base.py | 61 +++-- nemo/collections/vlm/mllama/model/language.py | 80 ++++++- nemo/collections/vlm/mllama/model/vision.py | 13 +- nemo/collections/vlm/recipes/mllama_11b.py | 33 +-- nemo/collections/vlm/recipes/mllama_90b.py | 25 ++- scripts/vlm/mllama_finetune.py | 212 ++++++++++++++++++ scripts/vlm/mllama_generation.py | 164 ++++++++++++++ 8 files changed, 573 insertions(+), 55 deletions(-) create mode 100644 scripts/vlm/mllama_finetune.py create mode 100644 scripts/vlm/mllama_generation.py diff --git a/nemo/collections/vlm/mllama/data/mock.py b/nemo/collections/vlm/mllama/data/mock.py index fae92b097200..4d078c745492 100644 --- a/nemo/collections/vlm/mllama/data/mock.py +++ b/nemo/collections/vlm/mllama/data/mock.py @@ -25,6 +25,26 @@ class MockDataModule(pl.LightningDataModule): + """ + Mock DataModule for testing and development. + Generates synthetic data for training, validation, and testing purposes. + + Args: + seq_length (int): Sequence length for the generated data. + decoder_seq_length (Optional[int]): Decoder sequence length if applicable, used in pp. + vocab_size (int): Size of the vocabulary of tokenizer. + crop_size (Tuple[int, int]): Image crop size (height, width). + micro_batch_size (int): Micro batch size for data loading. + global_batch_size (int): Global batch size across all processes. + rampup_batch_size (Optional[List[int]]): Batch size ramp-up configuration. + num_train_samples (int): Number of training samples to generate. + num_val_samples (int): Number of validation samples to generate. + num_test_samples (int): Number of test samples to generate. + num_workers (int): Number of workers for data loading. + pin_memory (bool): Whether to pin memory for data loading. + persistent_workers (bool): Whether workers should remain persistent. 
+ """ + def __init__( self, seq_length: int = 2048, @@ -66,6 +86,7 @@ def __init__( ) def setup(self, stage: str = "") -> None: + """Set up datasets for the specified stage.""" self._train_ds = _MockMLlamaDataset( self.vocab_size, self.crop_size, "train", self.num_train_samples, self.decoder_seq_length ) @@ -77,21 +98,25 @@ def setup(self, stage: str = "") -> None: ) def train_dataloader(self) -> TRAIN_DATALOADERS: + """Returns the DataLoader for training.""" if not hasattr(self, "_train_ds"): self.setup() return self._create_dataloader(self._train_ds) def val_dataloader(self) -> EVAL_DATALOADERS: + """Returns the DataLoader for validation.""" if not hasattr(self, "_validation_ds"): self.setup() return self._create_dataloader(self._validation_ds) def test_dataloader(self) -> EVAL_DATALOADERS: + """Returns the DataLoader for testing.""" if not hasattr(self, "_test_ds"): self.setup() return self._create_dataloader(self._test_ds) def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + """Creates a DataLoader for the specified dataset.""" return DataLoader( dataset, num_workers=self.num_workers, @@ -103,6 +128,18 @@ def _create_dataloader(self, dataset, **kwargs) -> DataLoader: class _MockMLlamaDataset(Dataset): + """ + Mock dataset for generating synthetic data with text and image components. + + Args: + vocab_size (int): Vocabulary size for text data. + crop_size (Tuple[int, int]): Image crop size (height, width). + name (str): Name of the dataset split ('train', 'valid', 'test'). + num_samples (int): Number of samples in the dataset. + seq_length (int): Sequence length for the text data. + seed (int): Seed for random number generation. + """ + def __init__( self, vocab_size, @@ -127,13 +164,16 @@ def __init__( self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) def __len__(self) -> int: + """Returns the number of samples in the dataset.""" return self.length def _get_text(self, idx: int) -> np.ndarray: + """Generates a random sequence of integers representing text tokens.""" np_gen = np.random.default_rng(seed=(self.seed + idx)) return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64) def __getitem__(self, idx) -> Dict[str, torch.Tensor]: + """Generates a single data sample.""" # Generate data of the expected size and datatype (based on GPTDataset). 
np_gen = np.random.default_rng(seed=(self.seed + idx)) tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64)) diff --git a/nemo/collections/vlm/mllama/model/base.py b/nemo/collections/vlm/mllama/model/base.py index d417af27aedd..9279936e23d7 100644 --- a/nemo/collections/vlm/mllama/model/base.py +++ b/nemo/collections/vlm/mllama/model/base.py @@ -47,7 +47,8 @@ from nemo.utils import logging -def llama_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: +def mllama_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: + """Mllama data step.""" from megatron.core import parallel_state # Based on: https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py#L87 @@ -96,7 +97,8 @@ def llama_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: return output -def llama_forward_step(model, batch) -> torch.Tensor: +def mllama_forward_step(model, batch) -> torch.Tensor: + """Mllama model forward step.""" forward_config = { "batch_images": batch["batch_images"], "batch_masks": batch["batch_masks"], @@ -114,13 +116,15 @@ def llama_forward_step(model, batch) -> torch.Tensor: def set_input_tensor(self, tensor): + """Placeholder for `set_input_tensor` method for PP implementation.""" pass @dataclass class CrossAttentionVisionConfig(TransformerConfig, io.IOMixin): - # core params + """Configuration for llama vision model.""" + # core params bias_activation_fusion: bool = True bias_dropout_add_fusion: bool = True @@ -150,9 +154,11 @@ class CrossAttentionVisionConfig(TransformerConfig, io.IOMixin): @property def max_aspect_ratio_id(self) -> int: + # pylint: disable=C0115,C0116 return len(self.supported_aspect_ratios) def configure_model(self) -> "CrossAttentionVisionModel": + """Configure mllama vision model.""" return CrossAttentionVisionModel( self, ) @@ -160,6 +166,10 @@ def configure_model(self) -> "CrossAttentionVisionModel": @dataclass class CrossAttentionTextConfig(Llama31Config): + """ + Configuration for llama model with cross-attention layers to take in multimodal features. 
+ """ + rotary_base: int = 500_000 seq_length: int = 8192 num_layers: int = 32 @@ -171,12 +181,14 @@ class CrossAttentionTextConfig(Llama31Config): apply_rope_fusion: bool = False def _init_fusion_schedule(self, num_layers: int) -> List[int]: - llama_layers = list(range(self.num_layers)) + """Initialize self-attention layer / cross-attention layer fusion schedule""" + mllama_layers = list(range(self.num_layers)) # uniformly spread the layers - k = math.ceil(len(llama_layers) / num_layers) - return llama_layers[::-1][::k][:num_layers][::-1] + k = math.ceil(len(mllama_layers) / num_layers) + return mllama_layers[::-1][::k][:num_layers][::-1] def configure_model(self, tokenizer, pre_process=True, post_process=True): + """Configure mllama text model.""" self.fusion_schedule = self._init_fusion_schedule(self.num_cross_attention_layers) vp_size = self.virtual_pipeline_model_parallel_size if vp_size: @@ -225,6 +237,8 @@ def configure_model(self, tokenizer, pre_process=True, post_process=True): @dataclass class MLlamaModelConfig(TransformerConfig, io.IOMixin): + """Combined configuration for multimodal vision-language model.""" + language_model_config: Optional[CrossAttentionTextConfig] = None vision_model_config: Optional[CrossAttentionVisionConfig] = None @@ -237,8 +251,8 @@ class MLlamaModelConfig(TransformerConfig, io.IOMixin): language_model_from_pretrained: Optional[str] = None # TODO vision_model_from_pretrained: Optional[str] = None # TODO - forward_step_fn: Callable = llama_forward_step - data_step_fn: Callable = llama_data_step + forward_step_fn: Callable = mllama_forward_step + data_step_fn: Callable = mllama_data_step def __post_init__(self): if self.language_model_config is not None: @@ -246,6 +260,7 @@ def __post_init__(self): setattr(self, attr, getattr(self.language_model_config, attr)) def configure_model(self, tokenizer) -> "MLlamaBaseModel": + """Configure mllama model.""" from megatron.core import parallel_state as ps self.language_model_config.tensor_model_parallel_size = self.tensor_model_parallel_size @@ -274,6 +289,8 @@ def configure_model(self, tokenizer) -> "MLlamaBaseModel": class CrossAttentionVisionModel(MegatronModule): + """Mllama vision model.""" + def __init__(self, config) -> None: super().__init__(config=config) return_intermediate = "3,7,15,23,30" @@ -303,6 +320,7 @@ def __init__(self, config) -> None: self.vision_projection.encoder.skip_bias_add = False # Temporary fix for a MCore side bug def forward(self, images: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch.Tensor: + """Forward.""" # vision_tokens: (B, T, D) # aspect_ratio_ids: (B, 1) # h: (B, T, D) @@ -313,10 +331,13 @@ def forward(self, images: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch return vision_tokens def set_input_tensor(self, tensor): + # pylint: disable=C0115,C0116 pass class MLlamaBaseModel(MegatronModule): + """Mllama base model combining vision and text models with cross-attention.""" + def __init__( self, config: MLlamaModelConfig, @@ -356,10 +377,6 @@ def __init__( self.patch_size = 14 self.image_res = vision_model_config.vision_chunk_size self.max_num_chunks = vision_model_config.vision_max_num_chunks - logging.warning("[WARNING] NeMo Mllama will always pad images to max number of tiles. 
A fix is coming soon!") - - def setup_cache(self, max_batch_size: int, dtype: torch.dtype): - self.language_model.setup_cache(max_batch_size, dtype) def compute_xattn_caches_masks( self, @@ -369,6 +386,7 @@ def compute_xattn_caches_masks( num_chunks: torch.Tensor, total_len: int, ) -> Tuple[List, torch.Tensor, torch.Tensor]: + """Compute xattn caches masks used in text model.""" bsz, nimg, nchunk, ntok, image_token_dim = vision_orig_shape xattn_caches = [ @@ -408,6 +426,7 @@ def forward( full_text_row_masked_out_mask: Optional[torch.Tensor] = None, xattn_caches: Optional[List] = None, ) -> torch.Tensor: + """Forward.""" if xattn_caches is None: bsz, max_num_images = batch_images.size(0), batch_images.size(1) vision_orig_shape = ( @@ -418,8 +437,8 @@ def forward( self.config.hidden_size, ) skip_vision_encoder = False - num_chunks[num_chunks > 0] = self.max_num_chunks if max_num_images == 0: + num_chunks[num_chunks > 0] = self.max_num_chunks skip_vision_encoder = True if self.encoder_hidden_state is not None: @@ -489,6 +508,8 @@ def set_input_tensor(self, input_tensor) -> None: class MLlamaModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): + """Lightning Module for the MLlama model.""" + def __init__( self, config: MLlamaModelConfig, @@ -506,6 +527,7 @@ def __init__( self._validation_loss_reduction = None def configure_model(self) -> None: + """Configure mllama model""" if not hasattr(self, "module"): self.module: MLlamaBaseModel = self.config.configure_model(self.tokenizer) @@ -522,7 +544,7 @@ def forward( full_text_row_masked_out_mask: Optional[torch.Tensor] = None, xattn_caches: Optional[torch.Tensor] = None, ) -> torch.Tensor: - + """Forward.""" output_tensor = self.module( position_ids=position_ids, tokens=tokens, @@ -539,22 +561,26 @@ def forward( return output_tensor def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]: + # pylint: disable=C0115,C0116 return self.config.data_step_fn(dataloader_iter) def forward_step(self, batch) -> torch.Tensor: + # pylint: disable=C0115,C0116 return self.config.forward_step_fn(self, batch) def training_step(self, batch, batch_idx=None) -> torch.Tensor: + # pylint: disable=C0115,C0116 # In mcore the loss-function is part of the forward-pass (when labels are provided) return self.forward_step(batch) def validation_step(self, batch, batch_idx=None) -> torch.Tensor: + # pylint: disable=C0115,C0116 # In mcore the loss-function is part of the forward-pass (when labels are provided) - return self.forward_step(batch) @property def training_loss_reduction(self) -> MaskedTokenLossReduction: + # pylint: disable=C0115,C0116 if not self._training_loss_reduction: self._training_loss_reduction = MaskedTokenLossReduction() @@ -562,6 +588,7 @@ def training_loss_reduction(self) -> MaskedTokenLossReduction: @property def validation_loss_reduction(self) -> MaskedTokenLossReduction: + # pylint: disable=C0115,C0116 if not self._validation_loss_reduction: self._validation_loss_reduction = MaskedTokenLossReduction(validation_step=True) @@ -573,8 +600,8 @@ def validation_loss_reduction(self) -> MaskedTokenLossReduction: "MLlamaModelConfig", "CrossAttentionTextConfig", "CrossAttentionVisionConfig", - "llama_data_step", - "llama_forward_step", + "mllama_data_step", + "mllama_forward_step", "transformer_engine_layer_spec", "local_layer_spec", ] diff --git a/nemo/collections/vlm/mllama/model/language.py b/nemo/collections/vlm/mllama/model/language.py index b8985e53c54c..5d4cc2e09f21 100644 --- a/nemo/collections/vlm/mllama/model/language.py +++ 
b/nemo/collections/vlm/mllama/model/language.py @@ -60,6 +60,10 @@ @dataclass class MLlamaCrossAttentionSubmodules: + """ + Defines the submodules required for cross-attention layers in the Llama architecture. + """ + linear_q: Union[ModuleSpec, type] = None linear_kv: Union[ModuleSpec, type] = None core_attention: Union[ModuleSpec, type] = None @@ -69,6 +73,10 @@ class MLlamaCrossAttentionSubmodules: class CrossAttentionTextModel(MCoreGPTModel): + """ + GPT-based model with integrated cross-attention layers for multimodal tasks. + """ + def __init__( self, config: TransformerConfig, @@ -122,6 +130,7 @@ def __init__( self._thresh = self.num_frozen_embeddings - 1 def get_partially_trainable_embedding(self, x): + """Get word embedding w/ few extra learnable tokens.""" xz = torch.zeros_like(x, device=x.device) oz = torch.ones_like(x, device=x.device) x_orig = torch.minimum(x, torch.tensor(self._thresh, device=x.device)) @@ -148,7 +157,7 @@ def forward( packed_seq_params: PackedSeqParams = None, extra_block_kwargs: dict = None, ) -> Tensor: - + """Forward.""" # Decoder embedding. if decoder_input is not None: pass @@ -171,6 +180,9 @@ def forward( ) rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + dtype = decoder_input.dtype + cross_attention_bias = cross_attention_masks.to(dtype) * torch.finfo(dtype).min + # Run decoder. hidden_states = self.decoder( hidden_states=decoder_input, @@ -178,9 +190,10 @@ def forward( inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, packed_seq_params=packed_seq_params, - cross_attention_masks=cross_attention_masks, + cross_attention_masks=None, full_text_row_masked_out_mask=full_text_row_masked_out_mask, xattn_caches=xattn_caches, + cross_attention_bias=cross_attention_bias, **(extra_block_kwargs or {}), ) @@ -203,6 +216,10 @@ def forward( class CrossAttentionTransformerBlock(TransformerBlock): + """ + Transformer block with integrated cross-attention layers for multimodal tasks. + """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -220,7 +237,7 @@ def __init__(self, *args, **kwargs): submodules=TransformerLayerSubmodules( cross_attention=ModuleSpec( module=MLlamaCrossAttention, - params={"attn_mask_type": AttnMaskType.arbitrary}, + params={"attn_mask_type": AttnMaskType.no_mask}, submodules=MLlamaCrossAttentionSubmodules( linear_q=TELayerNormColumnParallelLinear, # This wraps attention_norm before attention linear_kv=TEColumnParallelLinear, @@ -250,6 +267,7 @@ def __init__(self, *args, **kwargs): assert len(self.xattn_layers) == len(self.layers), 'Check PP implementation for cross attention layers!' 
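The hunk above swaps the boolean cross-attention mask for an additive bias: the 0/1 mask is cast to the compute dtype and multiplied by its most negative finite value, and the decoder is handed cross_attention_masks=None together with the new cross_attention_bias. A minimal standalone sketch of that conversion (illustration only, not code taken from the patch):

import torch

def mask_to_additive_bias(mask: torch.Tensor, dtype: torch.dtype = torch.bfloat16) -> torch.Tensor:
    # mask holds 0/1 values; 1 marks a position that must not be attended to
    return mask.to(dtype) * torch.finfo(dtype).min

mask = torch.tensor([[0, 1, 0]])                    # middle position is padding
bias = mask_to_additive_bias(mask)
probs = torch.softmax(torch.zeros_like(bias) + bias, dim=-1)
# probs is roughly [0.5, 0.0, 0.5]: the masked position receives no attention weight
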
def _get_layer_offset(self): + """Get correct layer offset when encoder pipeline parallel size > 0.""" encoder_pipeline_model_parallel_size = getattr(self.config, "encoder_pipeline_model_parallel_size", 0) decoder_pipeline_model_parallel_rank = ( parallel_state.get_pipeline_model_parallel_rank() - encoder_pipeline_model_parallel_size @@ -264,9 +282,12 @@ def forward( cross_attention_masks: Tensor = None, full_text_row_masked_out_mask: Tensor = None, rotary_pos_emb: Tensor = None, + attention_bias: Tensor = None, + cross_attention_bias: Tensor = None, inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, ): + """Forward.""" # hidden_states (float): [s, b, h] # attention_mask (bool): [1, 1, s, s] @@ -324,6 +345,7 @@ def forward( xattn_cache=xattn_caches[l_no], full_text_row_masked_out_mask=full_text_row_masked_out_mask, rotary_pos_emb=rotary_pos_emb, + cross_attention_bias=cross_attention_bias, inference_params=inference_params, packed_seq_params=packed_seq_params, ) @@ -331,6 +353,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, inference_params=inference_params, packed_seq_params=packed_seq_params, ) @@ -361,6 +384,7 @@ def forward( def sharded_state_dict( self, prefix: str = '', sharded_offsets: tuple = (), metadata: dict = None ) -> ShardedStateDict: + """Update shareded state dict for cross-attention layers""" sharded_state_dict = {} layer_prefix = f'{prefix}layers.' @@ -399,6 +423,10 @@ def sharded_state_dict( class CrossAttentionTransformerLayer(TransformerLayer): + """ + Transformer layer with cross-attention for integration. + """ + def __init__( self, config: TransformerConfig, @@ -417,6 +445,7 @@ def __init__( self.gate_ffn = nn.Parameter(torch.zeros(1, dtype=self.config.params_dtype)) def compute_xattn_kv_cache(self, xattn_tokens: Tensor) -> Tensor: + """Compute cross-attention kv cahce.""" return self.cross_attention._compute_xattn_kv_cache(xattn_tokens) def forward( @@ -426,9 +455,11 @@ def forward( xattn_cache=None, full_text_row_masked_out_mask=None, rotary_pos_emb=None, + cross_attention_bias=None, inference_params=None, packed_seq_params=None, ): + """Forward.""" # hidden_states: [s, b, h] # Residual connection. @@ -444,6 +475,7 @@ def forward( xattn_cache=xattn_cache, full_text_row_masked_out_mask=full_text_row_masked_out_mask, rotary_pos_emb=rotary_pos_emb, + cross_attention_bias=cross_attention_bias, inference_params=inference_params, ) @@ -507,11 +539,13 @@ def __call__( return hidden_states, None def compute_xattn_kv_cache(self, xattn_tokens: Tensor) -> Optional[Tensor]: + # pylint: disable=C0115,C0116 return None class MLlamaCrossAttention(Attention): - """Cross-attention layer class for Llama VLM support + """ + Cross-attention layer for Llama multimodal tasks. Cross-attention layer takes input with size [s, b, h] and context with size [s, b, h] and returns output of the same size. 
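The forward pass above runs, at every decoder index, the cross-attention layer first and the regular self-attention layer second; indices outside the fusion schedule are filled with dummy pass-through cross-attention modules so the two module lists stay aligned (hence the length assert). A simplified sketch of that interleaving, with toy modules standing in for the real Megatron layers (names and shapes are illustrative assumptions):

import torch
import torch.nn as nn

class PassThroughXAttn(nn.Module):
    # stands in for the dummy cross-attention layer: returns its input unchanged
    def forward(self, hidden_states):
        return hidden_states

class ToyInterleavedDecoder(nn.Module):
    def __init__(self, num_layers, fusion_schedule, hidden=64):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(hidden, hidden) for _ in range(num_layers))
        self.xattn_layers = nn.ModuleList(
            nn.Linear(hidden, hidden) if i in fusion_schedule else PassThroughXAttn()
            for i in range(num_layers)
        )
        assert len(self.xattn_layers) == len(self.layers)

    def forward(self, hidden_states):
        for xattn, self_attn in zip(self.xattn_layers, self.layers):
            hidden_states = xattn(hidden_states)      # cross-attend to vision features (or no-op)
            hidden_states = self_attn(hidden_states)  # regular self-attention decoder layer
        return hidden_states

decoder = ToyInterleavedDecoder(num_layers=8, fusion_schedule=[1, 3, 5, 7])
out = decoder(torch.randn(2, 16, 64))  # [batch, seq, hidden]
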
@@ -574,6 +608,7 @@ def __init__( ) def get_key_value_tensors(self, key_value_states): + """Get key value tensors.""" mixed_kv, _ = self.linear_kv(key_value_states) # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] @@ -590,7 +625,7 @@ def get_key_value_tensors(self, key_value_states): return key, value def get_query_tensor(self, hidden_states): - + """ "Get query tensor.""" # Attention head [sq, b, h] --> [sq, b, hp] query, _ = self.linear_q(hidden_states) @@ -607,6 +642,7 @@ def get_query_tensor(self, hidden_states): return query def get_query_key_value_tensors(self, hidden_states, key_value_states): + """Get query key value tensors.""" query = self.get_query_tensor(hidden_states) key, value = self.get_key_value_tensors(key_value_states) return query, key, value @@ -619,8 +655,17 @@ def forward( full_text_row_masked_out_mask=None, inference_params=None, rotary_pos_emb=None, + rotary_pos_cos=None, + rotary_pos_sin=None, + cross_attention_bias=None, packed_seq_params=None, ): + """Forward.""" + # hidden_states: [sq, b, h] + if self.config.flash_decode: + rotary_pos_emb = None + else: + assert rotary_pos_cos is None and rotary_pos_sin is None # For self attention we just duplicate the rotary_pos_emb if it isn't already if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): @@ -637,8 +682,8 @@ def forward( # =================================================== # Adjust key, value, and rotary_pos_emb for inference # =================================================== - key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( - inference_params, key, value, rotary_pos_emb + query, key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, query, key, value, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin ) if packed_seq_params is not None: @@ -650,9 +695,6 @@ def forward( # core attention computation # ================================== - # In TE "True" means masked out - cross_attention_masks = torch.where(cross_attention_masks == 0, False, True) - if self.checkpoint_core_attention and self.training: core_attn_out = self._checkpointed_attention_forward( query, @@ -660,6 +702,7 @@ def forward( value, cross_attention_masks, attn_mask_type=attn_mask_type, + attention_bias=cross_attention_bias, packed_seq_params=packed_seq_params, ) else: @@ -669,6 +712,7 @@ def forward( value, cross_attention_masks, attn_mask_type=attn_mask_type, + attention_bias=cross_attention_bias, packed_seq_params=packed_seq_params, ) @@ -702,8 +746,22 @@ def apply_rope_scaling( high_freq_factor: int = 4, old_context_len: int = 8192, ): + """ + Apply scaling to rotary embeddings for positional encoding. + + Args: + inv_freq (Tensor): Tensor of inverse frequencies. + factor (int): Scaling factor for medium-to-high frequencies. + low_freq_factor (int): Factor for identifying low frequencies. + high_freq_factor (int): Factor for identifying high frequencies. + old_context_len (int): Original context length for scaling computation. + + Returns: + Tensor: Scaled inverse frequencies. + """ logging.info( - f"Apply rope scaling with factor={factor}, low_freq_factor={low_freq_factor}, high_freq_factor={high_freq_factor}, old_context_len={old_context_len}." + f"Apply rope scaling with factor={factor}, low_freq_factor={low_freq_factor}, " + f"high_freq_factor={high_freq_factor}, old_context_len={old_context_len}." 
) low_freq_wavelen = old_context_len / low_freq_factor diff --git a/nemo/collections/vlm/mllama/model/vision.py b/nemo/collections/vlm/mllama/model/vision.py index f023cc7bf943..bb58ad093cd6 100644 --- a/nemo/collections/vlm/mllama/model/vision.py +++ b/nemo/collections/vlm/mllama/model/vision.py @@ -120,15 +120,16 @@ def build_encoder_attention_mask( torch.Tensor: Tensor containing the attention mask. """ masks = [] + dtype = x.dtype for ar_id in ar_ids: arx = supported_aspect_ratios[ar_id - 1] mask_i = torch.ones((num_chunks, x.shape[1] // num_chunks), device=x.device) mask_i[: arx[0] * arx[1], :ntok] = 0 mask_i = mask_i.view(num_chunks * x.shape[1] // num_chunks, -1) - mask_i = (mask_i @ mask_i.T).type(torch.bool) + mask_i = mask_i @ mask_i.T mask_i = mask_i.unsqueeze(0) masks.append(mask_i) - masks = torch.stack(masks) + masks = torch.stack(masks).to(dtype) * torch.finfo(dtype).min return masks @@ -197,6 +198,7 @@ def forward_with_return_intermediate( context: Tensor = None, context_mask: Tensor = None, rotary_pos_emb: Tensor = None, + attention_bias: Tensor = None, inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, return_intermediate: List[int] = None, @@ -253,6 +255,7 @@ def forward_with_return_intermediate( context=context, context_mask=context_mask, rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, packed_seq_params=packed_seq_params, ) else: @@ -269,6 +272,7 @@ def forward_with_return_intermediate( context=context, context_mask=context_mask, rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, inference_params=inference_params, packed_seq_params=packed_seq_params, ) @@ -506,6 +510,7 @@ def forward( attention_mask=attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, + attention_bias=attention_bias, packed_seq_params=packed_seq_params, ) @@ -690,11 +695,12 @@ def forward(self, images: torch.Tensor, ar_ids: torch.Tensor) -> torch.Tensor: x = x.view(bsz * num_concurrent_media, -1, dim) npad, attn_mask = 0, None - attn_mask = build_encoder_attention_mask(x, ar_ids, ntok, num_chunks, self.config.supported_aspect_ratios) + attn_bias = build_encoder_attention_mask(x, ar_ids, ntok, num_chunks, self.config.supported_aspect_ratios) x = x.transpose(0, 1).contiguous() x, int_x = self.transformer( hidden_states=x, attention_mask=attn_mask, + attention_bias=attn_bias, return_intermediate=self.return_intermediate, ) @@ -709,6 +715,7 @@ def forward(self, images: torch.Tensor, ar_ids: torch.Tensor) -> torch.Tensor: x = self.global_transformer( hidden_states=x, attention_mask=None, + attention_bias=attn_bias, ) x = x.transpose(0, 1) x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, dim) diff --git a/nemo/collections/vlm/recipes/mllama_11b.py b/nemo/collections/vlm/recipes/mllama_11b.py index e4842ae63d52..4b08606900e3 100644 --- a/nemo/collections/vlm/recipes/mllama_11b.py +++ b/nemo/collections/vlm/recipes/mllama_11b.py @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.collections.vlm.mllama.data.mock import MockDataModule +from nemo.utils.exp_manager import TimingCallback NAME = "mllama_11b" @@ -46,7 +47,7 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - return run.Config(vlm.MLlamaModel, config=run.Config(vlm.MLlamaConfig11B)) + return run.Config(vlm.MLlamaModel, 
config=run.Config(vlm.MLlamaConfig11BInstruct)) @run.cli.factory(target=llm.finetune, name=NAME) @@ -107,6 +108,7 @@ def finetune_recipe( plugins=bf16_mixed(), strategy=strategy, val_check_interval=100, + callbacks=[run.Config(TimingCallback)], ) recipe = run.Partial( @@ -115,34 +117,37 @@ def finetune_recipe( trainer=trainer, data=run.Config( MockDataModule, - seq_length=4100, # encoder (vision) seq length - decoder_seq_length=512, # decoder (llm) seq length - global_batch_size=16, - micro_batch_size=2, + seq_length=6404, # encoder (vision) seq length + decoder_seq_length=2048, # decoder (llm) seq length + global_batch_size=2, + micro_batch_size=1, vocab_size=128256, - crop_size=(448, 448), + crop_size=(560, 560), num_workers=0, ), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=2.0e-07, warmup_steps=150), - resume=nemo_resume("meta-llama/Llama-3.2-11B-Vision"), + resume=nemo_resume("meta-llama/Llama-3.2-11B-Vision-Instruct"), ) if peft_scheme is None or peft_scheme.lower() == 'none': recipe.trainer.strategy.tensor_model_parallel_size = 2 recipe.optim.config.lr = 2e-05 elif peft_scheme.lower() == 'lora': + # pylint: disable=line-too-long + """Adapted from https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/configs/peft.py""" recipe.peft = run.Config( vlm.LoRA, - freeze_vision_model=False, + freeze_vision_model=True, target_modules=[ - "*.language_model.*.linear_qkv", - "*.language_model.*.linear_q", - "*.language_model.*.linear_kv", - "*.language_model.*.linear_proj", - "*.language_model.*.linear_fc1", - "*.language_model.*.linear_fc2", + "linear_qkv", + "linear_q", + "linear_kv", ], + dim=8, + alpha=32, + dropout=0.05, + dropout_position="pre", ) recipe.optim.config.lr = 1e-4 else: diff --git a/nemo/collections/vlm/recipes/mllama_90b.py b/nemo/collections/vlm/recipes/mllama_90b.py index 28a6ff7ff9a6..12e0329fc6dd 100644 --- a/nemo/collections/vlm/recipes/mllama_90b.py +++ b/nemo/collections/vlm/recipes/mllama_90b.py @@ -26,6 +26,7 @@ from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed from nemo.collections.vlm.mllama.data.mock import MockDataModule +from nemo.utils.exp_manager import TimingCallback NAME = "mllama_90b" @@ -46,7 +47,7 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - return run.Config(vlm.MLlamaModel, config=run.Config(vlm.MLlamaConfig90B)) + return run.Config(vlm.MLlamaModel, config=run.Config(vlm.MLlamaConfig90BInstruct)) @run.cli.factory(target=llm.finetune, name=NAME) @@ -107,6 +108,7 @@ def finetune_recipe( plugins=bf16_mixed(), strategy=strategy, val_check_interval=100, + callbacks=[run.Config(TimingCallback)], ) recipe = run.Partial( @@ -116,7 +118,7 @@ def finetune_recipe( data=run.Config( MockDataModule, seq_length=6404, # encoder (vision) seq length - decoder_seq_length=512, # decoder (llm) seq length + decoder_seq_length=2048, # decoder (llm) seq length global_batch_size=16, micro_batch_size=2, vocab_size=128256, @@ -125,23 +127,26 @@ def finetune_recipe( ), log=llm.default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), optim=distributed_fused_adam_with_cosine_annealing(max_lr=1e-4, min_lr=2.0e-07, warmup_steps=150), - resume=nemo_resume("meta-llama/Llama-3.2-90B-Vision"), + 
resume=nemo_resume("meta-llama/Llama-3.2-90B-Vision-Instruct"), ) if peft_scheme is None or peft_scheme.lower() == 'none': raise ValueError("Full finetuning recipe for Llama-3.2-90B model will be supported soon.") elif peft_scheme.lower() == 'lora': + # pylint: disable=line-too-long + """Adapted from https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/configs/peft.py""" recipe.peft = run.Config( vlm.LoRA, - freeze_vision_model=False, + freeze_vision_model=True, target_modules=[ - "*.language_model.*.linear_qkv", - "*.language_model.*.linear_q", - "*.language_model.*.linear_kv", - "*.language_model.*.linear_proj", - "*.language_model.*.linear_fc1", - "*.language_model.*.linear_fc2", + "linear_qkv", + "linear_q", + "linear_kv", ], + dim=8, + alpha=32, + dropout=0.05, + dropout_position="pre", ) recipe.optim.config.lr = 1e-4 else: diff --git a/scripts/vlm/mllama_finetune.py b/scripts/vlm/mllama_finetune.py new file mode 100644 index 000000000000..2b6990a03aa5 --- /dev/null +++ b/scripts/vlm/mllama_finetune.py @@ -0,0 +1,212 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import WandbLogger +from transformers import AutoProcessor + +from nemo import lightning as nl +from nemo.collections import llm, vlm +from nemo.collections.vlm import ImageDataConfig +from nemo.collections.vlm.mllama.data.lazy import MLlamaLazyDataModule +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback + + +def main(args): + """ + Main function for setting up and training the MLLama model. + + This function prepares the data module, model, training strategy, + logger, checkpointing, and optimizer configuration. It then starts + the training loop using PyTorch Lightning's trainer. + + Args: + args (argparse.Namespace): The command-line arguments passed to the script. 
+ """ + # Setting gbs, mbs, and max_steps from arguments + gbs = args.gbs + mbs = args.mbs + max_steps = args.max_steps + + # encoder (vision) seq length + # ((img_res / patch_size) ** 2 + cls_token) * num_tiles, = ((560 / 14) ** 2 + 1) * 4 = 6404 + seq_length = 6404 + decoder_seq_length = 1024 # decoder (llm) seq length + + if args.restore_path is not None and args.restore_path.startswith("nemo://"): + model_id = args.restore_path[len("nemo://") :] + else: + model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" + + processor = AutoProcessor.from_pretrained(model_id) + image_processor = processor.image_processor + tokenizer = processor.tokenizer + + # Data configuration + data_config = ImageDataConfig( + image_folder=args.image_folder, + conv_template="mllama", + ) + + # Data module setup + data = MLlamaLazyDataModule( + paths=args.data_path, + data_config=data_config, + seq_length=seq_length, + decoder_seq_length=decoder_seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, + tokenizer=tokenizer, + image_processor=image_processor, + num_workers=16, + ) + + model_configs = { + "meta-llama/Llama-3.2-11B-Vision": vlm.MLlamaConfig11B, + "meta-llama/Llama-3.2-11B-Vision-Instruct": vlm.MLlamaConfig11BInstruct, + "meta-llama/Llama-3.2-90B-Vision": vlm.MLlamaConfig90B, + "meta-llama/Llama-3.2-90B-Vision-Instruct": vlm.MLlamaConfig90BInstruct, + } + conf = model_configs[model_id]() + if args.pp_size > 1: + conf.language_model_config.first_pipeline_num_layers = 0 + model = vlm.MLlamaModel(conf, tokenizer=tokenizer) + + # Training strategy setup + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=args.tp_size, + pipeline_model_parallel_size=args.pp_size, + encoder_pipeline_model_parallel_size=args.encoder_pp_size, + pipeline_dtype=torch.bfloat16, + ) + + # Checkpoint callback setup + checkpoint_callback = nl.ModelCheckpoint( + save_last=True, + monitor="reduced_train_loss", + save_top_k=6, + every_n_train_steps=100, + dirpath=args.log_dir, + ) + + # Trainer setup + trainer = nl.Trainer( + num_nodes=args.num_nodes, + devices=args.devices, + max_steps=max_steps, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + callbacks=[checkpoint_callback, TimingCallback()], + val_check_interval=500, + limit_val_batches=gbs, + log_every_n_steps=1, + num_sanity_val_steps=0, + ) + + # Logger setup + nemo_logger = nl.NeMoLogger( + log_dir=args.log_dir, + name=args.name, + wandb=WandbLogger(project=args.wandb_project, name=args.name) if args.wandb_project is not None else None, + ) + + # Auto resume setup + resume = nl.AutoResume( + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + resume_from_directory=args.log_dir, + restore_config=nl.RestoreConfig(path=args.restore_path) if args.restore_path is not None else None, + ) + + # Optimizer and scheduler setup + opt_config = OptimizerConfig( + optimizer='adam', + lr=args.lr, + adam_beta1=0.9, + adam_beta2=0.95, + use_distributed_optimizer=True, + bf16=True, + ) + sched = CosineAnnealingScheduler( + max_steps=trainer.max_steps, + warmup_steps=100, + constant_steps=0, + min_lr=args.lr, + ) + opt = MegatronOptimizerModule(opt_config, sched) + + # PEFT setup + if args.peft == 'lora': + peft = vlm.peft.LoRA( + freeze_vision_model=True, + target_modules=[ + "linear_qkv", + "linear_q", + "linear_kv", + ], + dim=8, + alpha=32, + dropout=0.05, + dropout_position="pre", + ) + else: + peft = None + + llm.finetune( + model=model, + data=data, + trainer=trainer, + peft=peft, + log=nemo_logger, + 
optim=opt, + resume=resume, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Mllama Model Training Script") + + parser.add_argument( + "--restore_path", type=str, required=False, default=None, help="Path to restore model from checkpoint" + ) + parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset") + parser.add_argument("--image_folder", type=str, required=True, help="Path to the image folder") + parser.add_argument( + "--log_dir", + type=str, + required=False, + default="/results", + help="Directory for logging and checkpoints", + ) + parser.add_argument("--devices", type=int, required=False, default=1) + parser.add_argument("--num_nodes", type=int, required=False, default=1) + parser.add_argument("--max_steps", type=int, required=False, default=5190) + parser.add_argument("--tp_size", type=int, required=False, default=1) + parser.add_argument("--pp_size", type=int, required=False, default=1) + parser.add_argument("--encoder_pp_size", type=int, required=False, default=0) + parser.add_argument("--name", type=str, required=False, default="neva_pretrain") + parser.add_argument("--peft", type=str, default='none', help="none | lora") + parser.add_argument("--wandb_project", type=str, required=False, default=None) + parser.add_argument("--gbs", type=int, required=False, default=64, help="Global batch size") + parser.add_argument("--mbs", type=int, required=False, default=2, help="Micro batch size") + parser.add_argument("--lr", type=float, required=False, default=2.0e-06, help="Learning rate") + + args = parser.parse_args() + main(args) diff --git a/scripts/vlm/mllama_generation.py b/scripts/vlm/mllama_generation.py new file mode 100644 index 000000000000..4ebf2d0055ad --- /dev/null +++ b/scripts/vlm/mllama_generation.py @@ -0,0 +1,164 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
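The generation script whose header appears above implements inference as a plain greedy loop: run the model, take the argmax over the final position's logits, append the token, and stop at EOS or after a fixed budget of new tokens (the real script additionally broadcasts each chosen token from rank 0). A minimal standalone version of that idea, with a random stand-in for the model (illustration only):

import torch

def greedy_decode(step_fn, tokens, eos_id, max_new_tokens=100):
    # step_fn maps [batch, seq] token ids to [batch, seq, vocab] logits
    for _ in range(max_new_tokens):
        logits = step_fn(tokens)
        next_ids = torch.argmax(logits[:, -1], dim=-1, keepdim=True)
        tokens = torch.cat([tokens, next_ids], dim=-1)
        if (next_ids == eos_id).all():
            break
    return tokens

vocab_size = 16
fake_model = lambda t: torch.randn(t.size(0), t.size(1), vocab_size)  # stand-in, not the VLM
out = greedy_decode(fake_model, torch.zeros(1, 4, dtype=torch.long), eos_id=vocab_size - 1)
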
+ +import argparse + +import requests +import torch +from PIL import Image +from transformers import AutoProcessor + +from nemo import lightning as nl +from nemo.collections import vlm +from nemo.collections.vlm.mllama.model.utils import create_vision_mask_tensor + +model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" + + +def load_image(image_url: str) -> Image.Image: + # pylint: disable=C0115,C0116 + try: + response = requests.get(image_url, stream=True) + response.raise_for_status() + image = Image.open(response.raw) + return image + except requests.exceptions.RequestException as e: + print(f"Error loading image from {image_url}: {e}") + return None + + +def generate(model, processor, image, text): + # pylint: disable=C0115,C0116 + tokenizer = processor.tokenizer + + messages = [ + { + "role": "user", + "content": [{"type": "text", "text": text}], + } + ] + input_text = processor.apply_chat_template(messages, add_generation_prompt=True) + batch = processor(image, input_text, add_special_tokens=False, return_tensors="pt") + + input_ids = batch["input_ids"].cuda(non_blocking=True) + position_ids = ( + torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0).expand_as(input_ids) + ) + num_tiles = processor.image_processor.preprocess(image, return_tensors='pt')["num_tiles"] + + min_prompt_len = position_ids.shape[-1] + + input_ids = input_ids[:, :min_prompt_len] + generated_ids = input_ids.clone() + + from tqdm import tqdm + + for cur_pos in tqdm(range(min_prompt_len, min_prompt_len + 100)): + with torch.no_grad(): + position_ids = torch.arange(0, cur_pos, dtype=torch.long, device="cuda").reshape(1, -1) + batch_masks = create_vision_mask_tensor(generated_ids[0]) + + output = model( + batch_images=batch["pixel_values"].cuda(non_blocking=True), + batch_masks=[batch_masks], + num_chunks=torch.tensor(num_tiles), + aspect_ratio_ids=batch["aspect_ratio_ids"].cuda(non_blocking=True), + tokens=generated_ids, + position_ids=position_ids, + ) + + next_token_ids = torch.argmax(output[:, -1], dim=-1, keepdim=True) + # Broadcast the tensor from rank 0 to all other ranks + torch.distributed.broadcast(next_token_ids, src=0) + generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) + if (next_token_ids == tokenizer.eos_token_id).all(): + break + + generated_ids = generated_ids.tolist() + generated_texts = tokenizer.decode(generated_ids[0][min_prompt_len:]) + + if torch.distributed.get_rank() == 0: + print("======== GENERATED TEXT OUTPUT ========") + print(f"{generated_texts}") + print("=======================================") + return generated_texts + + +def main(args) -> None: + # pylint: disable=C0115,C0116 + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=args.tp_size, + ckpt_load_optimizer=False, + ckpt_save_optimizer=False, + ) + trainer = nl.Trainer( + devices=args.tp_size, + max_steps=1000, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + val_check_interval=1000, + limit_val_batches=50, + ) + + processor = AutoProcessor.from_pretrained(model_id) + tokenizer = processor.tokenizer + + fabric = trainer.to_fabric() + + if args.load_from_hf: + model = fabric.import_model(f"hf://{model_id}", vlm.MLlamaModel) + else: + model = vlm.MLlamaModel(vlm.MLlamaConfig11BInstruct(), tokenizer=tokenizer) + model = fabric.load_model(args.local_model_path, model) + + model = model.module.cuda() + model.eval() + model = model.to(torch.bfloat16) + + # Load the image + raw_image = load_image(args.image_url) + 
if raw_image is None: + return # Exit if the image can't be loaded + + generate(model, processor, image=raw_image, text="<|image|>\nDescribe the image.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--load_from_hf", + action="store_true", + help="Flag to indicate whether to load the model from Hugging Face hub.", + ) + parser.add_argument( + "--local_model_path", + type=str, + default=None, + help="Local path to the model if not loading from Hugging Face.", + ) + parser.add_argument( + "--image_url", + type=str, + # pylint: disable=line-too-long + default="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + help="URL of the image to use for inference.", + ) + parser.add_argument("--devices", type=int, required=False, default=1) + parser.add_argument("--tp_size", type=int, required=False, default=1) + parser.add_argument("--pp_size", type=int, required=False, default=1) + parser.add_argument("--encoder_pp_size", type=int, required=False, default=0) + + args = parser.parse_args() + main(args) From ee072617f15f2d47cb1888853af60d81c56e7fba Mon Sep 17 00:00:00 2001 From: nune-tadevosyan <152167970+nune-tadevosyan@users.noreply.github.com> Date: Mon, 25 Nov 2024 12:18:34 +0400 Subject: [PATCH 09/11] Lhotse support for transcribe_speech_parallel (#11249) * Lhotse support for transcribe_speech_parallel Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Removing prints Signed-off-by: Nune * Remove Signed-off-by: Nune * Adding shard_id Signed-off-by: Nune * Handling empty text fields Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Changing keys Signed-off-by: Nune * Key Signed-off-by: Nune * Commented issues Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Commented issues Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * test for lhotse metadata return Signed-off-by: Nune * test for lhotse metadata return Signed-off-by: Nune * Small change Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Support for RNNT and CTC model Signed-off-by: Nune * Support for all models Signed-off-by: Nune * Small change Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Tests for predict_step Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan * Adding support for force_map_dataset Signed-off-by: Nune * Apply isort and black reformatting Signed-off-by: nune-tadevosyan --------- Signed-off-by: Nune Signed-off-by: nune-tadevosyan Co-authored-by: nune-tadevosyan --- examples/asr/transcribe_speech_parallel.py | 10 +++- .../asr/data/audio_to_text_dataset.py | 8 +++- .../asr/data/audio_to_text_lhotse.py | 7 ++- .../asr/models/configs/asr_models_config.py | 11 +++++ nemo/collections/asr/models/ctc_bpe_models.py | 12 +++-- nemo/collections/asr/models/ctc_models.py | 11 +++-- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 8 +++- .../asr/models/hybrid_rnnt_ctc_models.py | 4 +- .../collections/asr/models/rnnt_bpe_models.py | 8 +++- nemo/collections/asr/models/rnnt_models.py | 11 +++-- .../asr/models/transformer_bpe_models.py | 8 +++- .../common/data/lhotse/dataloader.py | 43 ++++++++++++----- .../asr/test_asr_ctc_encoder_model_bpe.py | 33 +++++++++++++ .../asr/test_asr_ctcencdec_model.py | 35 ++++++++++++++ 
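Patch 09 enables Lhotse dataloading in transcribe_speech_parallel.py by flipping a few predict_ds flags before the dataloader is built. The snippet below restates those overrides outside the script purely to make their intent explicit (it mirrors the added code rather than replacing it; the manifest value is a placeholder):

from omegaconf import OmegaConf

predict_ds = OmegaConf.create({"use_lhotse": True, "manifest_filepath": "dummy_manifest.json"})
OmegaConf.set_struct(predict_ds, False)
predict_ds.force_finite = True       # make the Lhotse sampler finite so prediction terminates
predict_ds.force_map_dataset = True  # keep the sampler in the loop process (map-dataset flavor)
predict_ds.do_transcribe = True      # dataset returns cuts so per-sample metadata can be written
OmegaConf.set_struct(predict_ds, True)
# The script also sets trainer.use_distributed_sampler = False and, once the trainer exists,
# copies trainer.global_rank / trainer.world_size into predict_ds for the CPU-initialized model.
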
.../asr/test_asr_hybrid_rnnt_ctc_model_bpe.py | 15 ++++++ .../test_asr_hybrid_rnnt_ctc_model_char.py | 17 +++++++ .../asr/test_asr_lhotse_dataset.py | 32 +++++++++++++ .../asr/test_asr_rnnt_encdec_model.py | 17 +++++++ .../asr/test_asr_rnnt_encoder_model_bpe.py | 48 +++++++++++++++---- 19 files changed, 296 insertions(+), 42 deletions(-) diff --git a/examples/asr/transcribe_speech_parallel.py b/examples/asr/transcribe_speech_parallel.py index bdf54ea67f7d..d60099acd379 100644 --- a/examples/asr/transcribe_speech_parallel.py +++ b/examples/asr/transcribe_speech_parallel.py @@ -163,6 +163,14 @@ def main(cfg: ParallelTranscriptionConfig): cfg.predict_ds.return_sample_id = True cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model.cfg.train_ds) + if cfg.predict_ds.use_lhotse: + OmegaConf.set_struct(cfg.predict_ds, False) + cfg.trainer.use_distributed_sampler = False + cfg.predict_ds.force_finite = True + cfg.predict_ds.force_map_dataset = True + cfg.predict_ds.do_transcribe = True + OmegaConf.set_struct(cfg.predict_ds, True) + if isinstance(model, EncDecMultiTaskModel): cfg.trainer.use_distributed_sampler = False OmegaConf.set_struct(cfg.predict_ds, False) @@ -172,7 +180,7 @@ def main(cfg: ParallelTranscriptionConfig): trainer = ptl.Trainer(**cfg.trainer) - if isinstance(model, EncDecMultiTaskModel): + if cfg.predict_ds.use_lhotse: OmegaConf.set_struct(cfg.predict_ds, False) cfg.predict_ds.global_rank = trainer.global_rank cfg.predict_ds.world_size = trainer.world_size diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 76537a8b2b78..f91710de3cb3 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -867,10 +867,16 @@ def write_on_batch_end( sample = sample_id if isinstance(sample, lhotse.cut.MixedCut): sample = sample.first_non_padding_cut + if sample.recording.sources[0].source != '': + item["audio_filepath"] = sample.recording.sources[0].source + else: + item["audio_filepath"] = sample.id item["audio_filepath"] = sample.recording.sources[0].source item["offset"] = sample.start item["duration"] = sample.duration - item["text"] = sample.supervisions[0].text + item["text"] = sample.supervisions[0].text or '' + if hasattr(sample, 'shard_id'): + item["shard_id"] = sample.shard_id item["pred_text"] = transcribed_text self.outf.write(json.dumps(item) + "\n") self.samples_num += 1 diff --git a/nemo/collections/asr/data/audio_to_text_lhotse.py b/nemo/collections/asr/data/audio_to_text_lhotse.py index f916ae1de56b..0ae3059a9296 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse.py @@ -43,17 +43,18 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), } - def __init__(self, tokenizer): + def __init__(self, tokenizer, return_cuts=False): super().__init__() self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) + self.return_cuts = return_cuts def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: audio, audio_lens, cuts = self.load_audio(cuts) tokens = [ torch.cat( [ - torch.as_tensor(s.tokens if hasattr(s, "tokens") else self.tokenizer(s.text, s.language)) + torch.as_tensor(s.tokens if hasattr(s, "tokens") else self.tokenizer(s.text or "", s.language)) for s in c.supervisions ], dim=0, @@ -62,6 +63,8 @@ def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: 
] token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) tokens = collate_vectors(tokens, padding_value=0) + if self.return_cuts: + return audio, audio_lens, tokens, token_lens, cuts.drop_in_memory_data() return audio, audio_lens, tokens, token_lens diff --git a/nemo/collections/asr/models/configs/asr_models_config.py b/nemo/collections/asr/models/configs/asr_models_config.py index 29dbbe06d1f8..081233da5d32 100644 --- a/nemo/collections/asr/models/configs/asr_models_config.py +++ b/nemo/collections/asr/models/configs/asr_models_config.py @@ -41,6 +41,17 @@ class ASRDatasetConfig(nemo.core.classes.dataset.DatasetConfig): shard_manifests: bool = False shuffle_n: int = 0 + # lhotse support + use_lhotse: bool = False + tarred_random_access: bool = False + use_bucketing: bool = False + batch_duration: Optional[int] = None + quadratic_duration: Optional[int] = None + bucket_batch_size: Optional[int] = None + bucket_duration_bins: Optional[list] = None + num_buckets: Optional[int] = 0 + pin_memory: bool = False + # Optional int_values: Optional[int] = None augmentor: Optional[Dict[str, Any]] = None diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 79c22794de01..1f84989c8ebe 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -97,9 +97,15 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, - dataset=LhotseSpeechToTextBpeDataset(tokenizer=self.tokenizer), + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), + dataset=LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), + ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 76dcd13cca50..ae8c35220931 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -309,8 +309,11 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. 
+ global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=make_parser( labels=config.get('labels', None), @@ -319,6 +322,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): blank_id=config.get('blank_index', -1), do_normalize=config.get('normalize_transcripts', False), ), + return_cuts=config.get("do_transcribe", False), ), ) @@ -614,7 +618,8 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): return_hypotheses=False, ) - sample_id = sample_id.cpu().detach().numpy() + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.cpu().detach().numpy() return list(zip(sample_id, transcribed_texts)) def validation_pass(self, batch, batch_idx, dataloader_idx=0): diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 7e8720ee3ad8..cd04a5ad2462 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -140,10 +140,14 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index 34dd9aae5711..1f63c617cea2 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -519,8 +519,8 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) - - sample_id = sample_id.cpu().detach().numpy() + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.cpu().detach().numpy() return list(zip(sample_id, best_hyp_text)) def validation_pass(self, batch, batch_idx, dataloader_idx): diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index c92bcfaaef7a..cd8667f2f0fe 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -509,10 +509,14 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. 
+ global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index e4d1abd0b50c..78038d404107 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -469,8 +469,11 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=make_parser( labels=config.get('labels', None), @@ -479,6 +482,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): blank_id=config.get('blank_index', -1), do_normalize=config.get('normalize_transcripts', False), ), + return_cuts=config.get("do_transcribe", False), ), ) @@ -814,7 +818,8 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False ) - sample_id = sample_id.cpu().detach().numpy() + if isinstance(sample_id, torch.Tensor): + sample_id = sample_id.cpu().detach().numpy() return list(zip(sample_id, best_hyp_text)) def validation_pass(self, batch, batch_idx, dataloader_idx=0): diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 8d0f2b2223a3..4692cb662b4b 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -225,10 +225,14 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): config = self._update_default_values(config) return get_lhotse_dataloader_from_config( config, - global_rank=self.global_rank, - world_size=self.world_size, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), dataset=LhotseSpeechToTextBpeDataset( tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 98b63a07fa9d..bf6b77ad907e 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -147,6 +147,28 @@ class LhotseDataLoadingConfig: # In most cases (such as regular multi-GPU training) it will result in a deadlock due to # a different number of steps on different DDP ranks. 
force_finite: bool = False + # The following two options may be used to override auto-detection of appropriate PyTorch dataset flavor + # for your data types. PyTorch DataLoader uses two objects to yield data: dataset and sampler. + # *Map-dataset flavor.* There is one sampler per GPU that lives in the training loop process; + # it selects the examples to be prepared by map-dataset class. Each batch selection determined by the sampler + # is then passed by the dataloader to one of its worker processes to be processed by the dataset class. + # *Iterable-dataset flavor.* Each dataloading worker has its own sampler replica instead; + # the sampler must have the logic for either data deduplication or unique order shuffling to avoid + # duplicated data across workers and GPUs. Lhotse relies on unique order shuffling. + # The default settings are: + # * use iterable dataset for tarred audio data. + # * use iterable dataset for any text data. + # * use map dataset for non-tarred audio data (we might change this in the future) + force_map_dataset: bool = False + force_iterable_dataset: bool = False + + +def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfig) -> bool: + assert not ( + config.force_map_dataset and config.force_iterable_dataset + ), "Conflicting options: force_map_dataset=True and force_iterable_dataset=True" + use_iterable_dataset = (use_iterable_dataset or config.force_iterable_dataset) and not config.force_map_dataset + return use_iterable_dataset def get_lhotse_dataloader_from_config( @@ -176,7 +198,6 @@ def get_lhotse_dataloader_from_config( Note that ``tokenizer`` can be any tokenizer type (e.g. both SentencePiece and Aggregate tokenizers work). """ logging.info("We will be using a Lhotse DataLoader.") - config = make_structured_with_schema_warnings(config) maybe_set_cuda_expandable_segments(enabled=config.cuda_expandable_segments) @@ -186,8 +207,8 @@ def get_lhotse_dataloader_from_config( fix_random_seed(seed) # 1. Load a manifest as a Lhotse CutSet. - cuts, is_tarred = read_cutset_from_config(config) - + cuts, use_iterable_dataset = read_cutset_from_config(config) + use_iterable_dataset = determine_use_iterable_dataset(use_iterable_dataset, config) # Apply channel selector if config.channel_selector is not None: logging.info('Using channel selector %s.', config.channel_selector) @@ -202,7 +223,7 @@ def get_lhotse_dataloader_from_config( if tokenizer is not None and config.pretokenize: from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper - if not is_tarred: + if not use_iterable_dataset: logging.warning( "You are using a non-tarred dataset and requested tokenization during data sampling (pretokenize=True). 
" "This will cause the tokenization to happen in the main (GPU) process, possibly impacting the training speed " @@ -317,8 +338,8 @@ def get_lhotse_dataloader_from_config( duration_bins=determine_bucket_duration_bins(config), num_cuts_for_bins_estimate=config.num_cuts_for_bins_estimate, buffer_size=config.bucket_buffer_size, - rank=0 if is_tarred else global_rank, - world_size=1 if is_tarred else world_size, + rank=0 if use_iterable_dataset else global_rank, + world_size=1 if use_iterable_dataset else world_size, ) else: # Non-bucketing sampler, similar to original NeMo dataloading without bucketing, @@ -335,8 +356,8 @@ def get_lhotse_dataloader_from_config( drop_last=config.drop_last, shuffle_buffer_size=config.shuffle_buffer_size, seed=config.shard_seed, - rank=0 if is_tarred else global_rank, - world_size=1 if is_tarred else world_size, + rank=0 if use_iterable_dataset else global_rank, + world_size=1 if use_iterable_dataset else world_size, ) if config.concatenate_samples: @@ -368,7 +389,7 @@ def get_lhotse_dataloader_from_config( ) # 4. Creating dataloader. - if is_tarred and not config.tarred_random_access: + if use_iterable_dataset and not config.tarred_random_access: # Wrapper here is necessary when using NeMo tarred data or Lhotse Shar data, # because then I/O happens upon sampler iteration. Normally, the sampler resides # in the training loop process, but when we use iterable dataset, we can move it to @@ -601,8 +622,8 @@ class DurationFilter: """Callable, returns ``True`` if a cut's duration is in range [d_min, d_max] and ``False`` otherwise.""" def __init__(self, d_min: float, d_max: float) -> None: - self.d_min = d_min - self.d_max = d_max + self.d_min = d_min if d_min is not None else -1.0 + self.d_max = d_max if d_max is not None else float("inf") def __call__(self, example) -> bool: if isinstance(example, Cut): diff --git a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py index 247906247091..02442291a918 100644 --- a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py @@ -19,9 +19,12 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig from nemo.collections.asr.data import audio_to_text +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import configs from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.submodules import ctc_beam_decoding as beam_decode @@ -118,6 +121,18 @@ def test_forward(self, asr_model): diff = torch.max(torch.abs(logprobs_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, asr_model): + asr_model = asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=asr_model.tokenizer, return_cuts=True) + batch = dataset[cuts] + outputs = asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.with_downloads() @pytest.mark.unit def test_save_restore_artifact(self, asr_model): @@ -333,6 +348,15 @@ def test_ASRDatasetConfig_for_AudioToBPEDataset(self): 'bucketing_strategy', 'bucketing_weights', 'channel_selector', + 'use_lhotse', + 'tarred_random_access', + 
'use_bucketing', + 'batch_duration', + 'quadratic_duration', + 'bucket_batch_size', + 'bucket_duration_bins', + 'num_buckets', + 'pin_memory', ] REMAP_ARGS = {'trim_silence': 'trim', 'labels': 'tokenizer'} @@ -372,6 +396,15 @@ def test_ASRDatasetConfig_for_TarredAudioToBPEDataset(self): 'bucketing_strategy', 'bucketing_weights', 'max_utts', + 'use_lhotse', + 'tarred_random_access', + 'use_bucketing', + 'batch_duration', + 'quadratic_duration', + 'bucket_batch_size', + 'bucket_duration_bins', + 'num_buckets', + 'pin_memory', ] REMAP_ARGS = { diff --git a/tests/collections/asr/test_asr_ctcencdec_model.py b/tests/collections/asr/test_asr_ctcencdec_model.py index 28a07fd54663..55451758578f 100644 --- a/tests/collections/asr/test_asr_ctcencdec_model.py +++ b/tests/collections/asr/test_asr_ctcencdec_model.py @@ -15,12 +15,16 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig, OmegaConf, open_dict import nemo.collections.asr as nemo_asr from nemo.collections.asr.data import audio_to_text +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import EncDecCTCModel, configs from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.common.parts.preprocessing.parsers import make_parser from nemo.utils.config_utils import assert_dataclass_signature_match, update_model_config @@ -131,6 +135,19 @@ def test_forward(self, asr_model): diff = torch.max(torch.abs(logprobs_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, asr_model): + token_list = [" ", "a", "b", "c"] + asr_model = asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=make_parser(labels=token_list), return_cuts=True) + batch = dataset[cuts] + outputs = asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.unit def test_vocab_change(self, asr_model): old_vocab = copy.deepcopy(asr_model.decoder.vocabulary) @@ -274,6 +291,15 @@ def test_ASRDatasetConfig_for_AudioToCharDataset(self): 'bucketing_strategy', 'bucketing_weights', 'channel_selector', + 'use_lhotse', + 'tarred_random_access', + 'use_bucketing', + 'batch_duration', + 'quadratic_duration', + 'bucket_batch_size', + 'bucket_duration_bins', + 'num_buckets', + 'pin_memory', ] REMAP_ARGS = {'trim_silence': 'trim'} @@ -307,6 +333,15 @@ def test_ASRDatasetConfig_for_TarredAudioToCharDataset(self): 'bucketing_strategy', 'bucketing_weights', 'max_utts', + 'use_lhotse', + 'tarred_random_access', + 'use_bucketing', + 'batch_duration', + 'quadratic_duration', + 'bucket_batch_size', + 'bucket_duration_bins', + 'num_buckets', + 'pin_memory', ] REMAP_ARGS = { diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py index 1743acc6878c..d13c879e47f9 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_bpe.py @@ -18,8 +18,11 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from 
nemo.collections.asr.models.hybrid_rnnt_ctc_bpe_models import EncDecHybridRNNTCTCBPEModel from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode @@ -166,6 +169,18 @@ def test_forward(self, hybrid_asr_model): diff = torch.max(torch.abs(logits_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, hybrid_asr_model): + hybrid_asr_model = hybrid_asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=hybrid_asr_model.tokenizer, return_cuts=True) + batch = dataset[cuts] + outputs = hybrid_asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.with_downloads() @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py index 5362966e2e9e..b5c34e197237 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py @@ -16,14 +16,18 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig, ListConfig +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import EncDecHybridRNNTCTCModel from nemo.collections.asr.modules import RNNTDecoder, RNNTJoint, SampledRNNTJoint, StatelessTransducerDecoder from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig from nemo.collections.asr.parts.utils import rnnt_utils +from nemo.collections.common.parts.preprocessing.parsers import make_parser from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ from nemo.utils.config_utils import assert_dataclass_signature_match @@ -164,6 +168,19 @@ def test_forward(self, hybrid_asr_model): diff = torch.max(torch.abs(logprobs_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, hybrid_asr_model): + token_list = [" ", "a", "b", "c"] + hybrid_asr_model = hybrid_asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=make_parser(labels=token_list), return_cuts=True) + batch = dataset[cuts] + outputs = hybrid_asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', diff --git a/tests/collections/asr/test_asr_lhotse_dataset.py b/tests/collections/asr/test_asr_lhotse_dataset.py index 5a1450e606ac..c131fac70310 100644 --- a/tests/collections/asr/test_asr_lhotse_dataset.py +++ b/tests/collections/asr/test_asr_lhotse_dataset.py @@ -65,3 +65,35 @@ def test_lhotse_asr_dataset(tokenizer): assert tokens[2].tolist() == [1, 7, 10, 19, 20, 21, 1, 20, 6, 4, 16, 15, 5] assert token_lens.tolist() == 
[11, 11, 13] + + +def test_lhotse_asr_dataset_metadata(tokenizer): + + cuts = DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True) + + cuts[0].id = "cuts0" + cuts[1].id = "cuts1" + cuts[0].supervisions = [ + SupervisionSegment(id="cuts0-sup0", recording_id=cuts[0].recording_id, start=0.2, duration=0.5, text="first"), + ] + cuts[1].supervisions = [ + SupervisionSegment(id="cuts1-sup0", recording_id=cuts[1].recording_id, start=0, duration=1, text=""), + ] + + datasets_metadata = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer, return_cuts=True) + batch = datasets_metadata[cuts] + assert isinstance(batch, tuple) + assert len(batch) == 5 + + _, _, _, _, cuts_metadata = batch + + assert cuts_metadata[0].supervisions[0].text == "first" + assert cuts_metadata[1].supervisions[0].text == "" + assert cuts_metadata[0].id == "cuts0" + assert cuts_metadata[1].id == "cuts1" + + assert cuts_metadata[0].supervisions[0].duration == 0.5 + assert cuts_metadata[0].supervisions[0].start == 0.2 + + assert cuts_metadata[1].supervisions[0].duration == 1 + assert cuts_metadata[1].supervisions[0].start == 0.0 diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index d68088fce376..5e810243c919 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -17,13 +17,17 @@ import pytest import torch import torch.nn.functional as F +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from omegaconf import DictConfig, ListConfig +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import EncDecRNNTModel from nemo.collections.asr.modules import HATJoint, RNNTDecoder, RNNTJoint, SampledRNNTJoint, StatelessTransducerDecoder from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode from nemo.collections.asr.parts.utils import rnnt_utils +from nemo.collections.common.parts.preprocessing.parsers import make_parser from nemo.core.utils import numba_utils from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ from nemo.utils.config_utils import assert_dataclass_signature_match @@ -296,6 +300,19 @@ def test_forward(self, asr_model): diff = torch.max(torch.abs(logprobs_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, asr_model): + token_list = [" ", "a", "b", "c"] + asr_model = asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=make_parser(labels=token_list), return_cuts=True) + batch = dataset[cuts] + outputs = asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', diff --git a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py index 960445061e24..aba364868e88 100644 --- a/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_rnnt_encoder_model_bpe.py @@ -18,8 +18,11 @@ import pytest import torch +from lhotse import CutSet, MonoCut +from lhotse.testing.dummies import DummyManifest from 
omegaconf import DictConfig +from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models import ASRModel from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode @@ -64,12 +67,18 @@ def asr_model(test_data_dir): decoder = { '_target_': 'nemo.collections.asr.modules.RNNTDecoder', - 'prednet': {'pred_hidden': model_defaults['pred_hidden'], 'pred_rnn_layers': 1,}, + 'prednet': { + 'pred_hidden': model_defaults['pred_hidden'], + 'pred_rnn_layers': 1, + }, } joint = { '_target_': 'nemo.collections.asr.modules.RNNTJoint', - 'jointnet': {'joint_hidden': 32, 'activation': 'relu',}, + 'jointnet': { + 'joint_hidden': 32, + 'activation': 'relu', + }, } decoding = {'strategy': 'greedy_batch', 'greedy': {'max_symbols': 30}} @@ -123,7 +132,8 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None): class TestEncDecRNNTBPEModel: @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.with_downloads() @pytest.mark.unit @@ -137,7 +147,8 @@ def test_constructor(self, asr_model): @pytest.mark.with_downloads() @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_forward(self, asr_model): @@ -170,9 +181,22 @@ def test_forward(self, asr_model): diff = torch.max(torch.abs(logits_instance - logprobs_batch)) assert diff <= 1e-6 + @pytest.mark.unit + def test_predict_step(self, asr_model): + asr_model = asr_model.eval() + cuts = DummyManifest(CutSet, begin_id=0, end_id=1, with_data=True) + dataset = LhotseSpeechToTextBpeDataset(tokenizer=asr_model.tokenizer, return_cuts=True) + batch = dataset[cuts] + outputs = asr_model.predict_step(batch, 0) + assert len(outputs) == 1 + assert len(outputs[0]) == 2 + assert isinstance(outputs[0][0], MonoCut) + assert isinstance(outputs[0][1], str) + @pytest.mark.with_downloads() @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_save_restore_artifact(self, asr_model): @@ -190,7 +214,8 @@ def test_save_restore_artifact(self, asr_model): @pytest.mark.with_downloads() @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_save_restore_artifact_spe(self, asr_model, test_data_dir): @@ -236,7 +261,8 @@ def test_save_restore_artifact_agg(self, asr_model, test_data_dir): @pytest.mark.with_downloads() @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_vocab_change(self, test_data_dir, asr_model): @@ -266,7 +292,8 @@ def test_vocab_change(self, test_data_dir, asr_model): @pytest.mark.with_downloads() 
@pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) @pytest.mark.unit def test_decoding_change(self, asr_model): @@ -309,7 +336,8 @@ def test_decoding_change(self, asr_model): @pytest.mark.with_downloads() @pytest.mark.unit @pytest.mark.skipif( - not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + not NUMBA_RNNT_LOSS_AVAILABLE, + reason='RNNTLoss has not been compiled with appropriate numba version.', ) def test_save_restore_nested_model(self): with tempfile.TemporaryDirectory() as tmp_dir: @@ -330,7 +358,7 @@ def test_save_restore_nested_model(self): # Check size of the checkpoint, which contains weights from pretrained model + linear layer fp_weights = os.path.join(tmp_dir, 'model_weights.ckpt') - assert os.path.getsize(fp_weights) > 50 * (2 ** 20) # Assert the weights are more than 50 MB + assert os.path.getsize(fp_weights) > 50 * (2**20) # Assert the weights are more than 50 MB # Check if param after restoration is exact match original_state_dict = model.inner_model.state_dict() From 42d164e558555669fd96ba9a56e9afb6c1bc1ee1 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Mon, 25 Nov 2024 02:25:51 -0800 Subject: [PATCH 10/11] Fix environment variables in torchrun executor (#11363) Signed-off-by: Hemil Desai Co-authored-by: Marc Romeyn --- nemo/collections/llm/recipes/run/executor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nemo/collections/llm/recipes/run/executor.py b/nemo/collections/llm/recipes/run/executor.py index 305fa6b0a3c7..fe14a4f55bd2 100644 --- a/nemo/collections/llm/recipes/run/executor.py +++ b/nemo/collections/llm/recipes/run/executor.py @@ -18,11 +18,7 @@ def torchrun(devices: int = 8) -> run.Config[run.LocalExecutor]: """Local executor using torchrun.""" env_vars = { - "TRANSFORMERS_OFFLINE": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", } executor = run.Config( From 8f779babf33203f0ea42ebfcb3edc92fde5742d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Mon, 25 Nov 2024 13:24:15 +0100 Subject: [PATCH 11/11] Add sample generate to PTQ for NeMo 2.0 (#11339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Initial commit Signed-off-by: Piotr Kaminski * Remove leftover print Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Fix docs and type annotations Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Applied code review suggestions Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * Fix _get_decoder_type parameter Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com> Co-authored-by: Piotr Kaminski Co-authored-by: Laplasjan107 --- .../collections/llm/quantization/quantizer.py | 113 ++++++++++-------- nemo/collections/llm/quantization/utils.py | 32 ++++- scripts/llm/ptq.py | 9 ++ 3 files changed, 103 insertions(+), 51 deletions(-) diff --git a/nemo/collections/llm/quantization/quantizer.py 
b/nemo/collections/llm/quantization/quantizer.py index 45f72f06741e..d41ba39f39ea 100644 --- a/nemo/collections/llm/quantization/quantizer.py +++ b/nemo/collections/llm/quantization/quantizer.py @@ -24,10 +24,12 @@ from tqdm import tqdm from nemo.collections import llm -from nemo.lightning.ckpt_utils import CONTEXT_PATH +from nemo.collections.llm.inference import MCoreTokenizerWrappper, generate +from nemo.lightning.ckpt_utils import ckpt_to_context_subdir +from nemo.lightning.megatron_parallel import MegatronParallel from nemo.utils import logging -from .utils import get_unwrapped_mcore_model +from .utils import get_modelopt_decoder_type, get_unwrapped_mcore_model try: import modelopt.torch.quantization as mtq @@ -83,35 +85,12 @@ class ExportConfig: decoder_type: Optional[str] = None inference_tensor_parallel: int = 1 inference_pipeline_parallel: int = 1 + generate_sample: bool = False def __post_init__(self): self.path = Path(self.path) -def get_modelopt_decoder_type(config: llm.GPTConfig) -> str: - """Infers the modelopt decoder type from GPTConfig class.""" - mapping = [ - (llm.Baichuan2Config, "baichuan"), - (llm.ChatGLMConfig, "chatglm"), - (llm.GemmaConfig, "gemma"), - (llm.LlamaConfig, "llama"), - (llm.MistralConfig7B, "llama"), - (llm.MixtralConfig, "llama"), - (llm.NemotronConfig, "gptnext"), - (llm.Qwen2Config, "qwen"), - # TODO: (llm.StarcoderConfig, ""), - (llm.Starcoder2Config, "gptnext"), - ] - - for config_class, decoder_type in mapping: - if isinstance(config, config_class): - return decoder_type - - logging.warning("Could not directly infer the decoder type") - # TODO: Add a reasonable behavior for GPTConfig (for instance based on position_embedding_type) - return "llama" - - class Quantizer: """Post-training quantization (PTQ) and TensorRT-LLM export of NeMo 2.0 checkpoints. 
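The class-to-decoder mapping removed from quantizer.py here reappears later in this patch in nemo/collections/llm/quantization/utils.py, now keyed on llm.GPTModel subclasses rather than on config classes. Below is a minimal, non-authoritative sketch of the intended call pattern; constructing the model directly and the specific Llama config used are illustrative assumptions, not taken from these hunks (inside the quantizer the instance is obtained by unwrapping MegatronParallel instead).

# Illustrative sketch only, not part of the patch: infer the modelopt decoder type
# from a model instance using the helper that this patch moves into utils.py.
from nemo.collections import llm
from nemo.collections.llm.quantization.utils import get_modelopt_decoder_type

# Building the model here is purely for demonstration; no weights are materialized
# at construction time in NeMo 2.0, the config only describes the architecture.
model = llm.LlamaModel(config=llm.Llama31Config8B())
print(get_modelopt_decoder_type(model))  # expected to print "llama" per the mapping shown below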
@@ -146,16 +125,37 @@ def __init__(self, quantization_config: QuantizationConfig, export_config: Expor assert dtype in SUPPORTED_DTYPE, f"Unsupported export dtype: {dtype}" self.torch_dtype = torch_dtype_from_precision(dtype) - def _setup(self, model: llm.GPTModel) -> None: + @staticmethod + def _setup(model: MegatronParallel) -> None: """Setup model for quantization.""" # TODO: disable activation checkpointing model.config.vocab_size = model.tokenizer.vocab_size model.freeze() - def _get_decoder_type(self, config: llm.GPTConfig): - return self.export_config.decoder_type or get_modelopt_decoder_type(config) + def _get_decoder_type(self, model: MegatronParallel): + if self.export_config.decoder_type is not None: + return self.export_config.decoder_type + unwrapped_model = model + while not isinstance(unwrapped_model, llm.GPTModel): + unwrapped_model = unwrapped_model.module + + return get_modelopt_decoder_type(unwrapped_model) + + @staticmethod + def _generate_sample(model: MegatronParallel): + prompts = ["Born in north-east France, Soyer trained as a", "Born in California, Soyer trained as a"] + + mcore_tokenizer = MCoreTokenizerWrappper(model.tokenizer) + mcore_inference = model.get_inference_wrapper( + params_dtype=torch.bfloat16, inference_batch_times_seqlen_threshold=30 + ) + + generated = [r.generated_text for r in generate(mcore_inference, mcore_tokenizer, prompts)] + outputs = [prompt + generation for prompt, generation in zip(prompts, generated)] + + logging.info(f'Sample generation after PTQ (with prompts): {outputs}') - def quantize(self, model: llm.GPTModel, forward_loop=None): + def quantize(self, model: MegatronParallel, forward_loop=None): """Quantize the model and calibrate using given forward loop.""" if forward_loop is None: get_dataloader = create_data_iterator_getter( @@ -185,7 +185,7 @@ def quantize(self, model: llm.GPTModel, forward_loop=None): self._setup(model) unwrapped_model = get_unwrapped_mcore_model(model) - decoder_type = self._get_decoder_type(unwrapped_model.config) + decoder_type = self._get_decoder_type(model) quant_cfg = QUANT_CFG_CHOICES[algorithm] if "awq" in algorithm: weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"] @@ -230,6 +230,10 @@ def quantize(self, model: llm.GPTModel, forward_loop=None): if dist.get_rank() == 0: mtq.print_quant_summary(unwrapped_model) + if self.export_config.generate_sample: + logging.info("Generating a sample output after model quantization.") + self._generate_sample(model) + return model def create_megatron_forward_loop( @@ -266,21 +270,34 @@ def loop(model): return loop - def export(self, model: llm.GPTModel, model_dir: str) -> None: + @staticmethod + def _validate_quantized_checkpoint(checkpoint_dir: Path, tensor_parallelism_size: int) -> bool: + """Basic validation of the model structure.""" + + saved_config = (checkpoint_dir / 'config.json').exists() + saved_weights = True + for i in range(tensor_parallelism_size): + saved_weights &= (checkpoint_dir / f'rank{i}.safetensors').exists() + + export_successful = saved_config and saved_weights + if not export_successful: + logging.error("Failed to export the quantized model.") + return export_successful + + def export(self, model: MegatronParallel, model_dir: str) -> None: """Export model to a TensorRT-LLM checkpoint.""" - assert self.export_config is not None, "Export config is not set" - # TODO: Add sample generate - # TODO: Support megatron_amp_O2 export_dir = self.export_config.path + inference_tp = self.export_config.inference_tensor_parallel + inference_pp 
= self.export_config.inference_pipeline_parallel use_nfs_workspace = model.config.pipeline_model_parallel_size > 1 export_tensorrt_llm_checkpoint( model=get_unwrapped_mcore_model(model), - decoder_type=self._get_decoder_type(model.config), + decoder_type=self._get_decoder_type(model), dtype=self.torch_dtype, export_dir=export_dir, - inference_tensor_parallel=self.export_config.inference_tensor_parallel, - inference_pipeline_parallel=self.export_config.inference_pipeline_parallel, + inference_tensor_parallel=inference_tp, + inference_pipeline_parallel=inference_pp, use_nfs_workspace=use_nfs_workspace, ) dist.barrier() @@ -288,14 +305,13 @@ def export(self, model: llm.GPTModel, model_dir: str) -> None: # Save the model context in order to restore its tokenizer later. The destination # path is "nemo_context" as this name is used in nemo.export to setup tokenizer. if dist.get_rank() == 0: + assert self._validate_quantized_checkpoint(export_dir, inference_tp) shutil.copytree( - os.path.join(model_dir, CONTEXT_PATH), + ckpt_to_context_subdir(model_dir), os.path.join(export_dir, "nemo_context"), dirs_exist_ok=True, ) - logging.info("Model context saved.") - - logging.info(f"Export succeeded, model has been exported to {export_dir}.") + logging.info(f"Export succeeded, model has been exported to {export_dir}.") def get_calib_data_iter( @@ -323,7 +339,7 @@ def get_calib_data_iter( def create_data_iterator_getter(model, dataset, seq_len, batch_size, calibration_size): """Create a function that provides iterator over a given dataset.""" - def _iterator(): + def _get_iterator(): CHARACTERS_PER_TOKEN = 4 dataloader = get_calib_data_iter( @@ -332,14 +348,13 @@ def _iterator(): batch_size=batch_size, calib_size=calibration_size, ) + + data = [] for batch in dataloader: batch = [model.tokenizer.text_to_ids(text)[:seq_len] for text in batch] batch = [ids + (seq_len - len(ids)) * [model.tokenizer.eos] for ids in batch] - yield torch.tensor(batch, device=model.device) + data.append(torch.tensor(batch, device=model.device)) - def _iterator_getter(): - dataloader = _iterator() - dataloader = [data for data in dataloader] - return iter(tqdm(dataloader)) + return iter(tqdm(data)) - return _iterator_getter + return _get_iterator diff --git a/nemo/collections/llm/quantization/utils.py b/nemo/collections/llm/quantization/utils.py index bdfccb208d06..20739c872e80 100644 --- a/nemo/collections/llm/quantization/utils.py +++ b/nemo/collections/llm/quantization/utils.py @@ -23,8 +23,33 @@ from nemo.utils import logging +def get_modelopt_decoder_type(model: llm.GPTModel) -> str: + """Infers the modelopt decoder type from GPTModel subclass.""" + mapping = [ + (llm.Baichuan2Model, "baichuan"), + (llm.ChatGLMModel, "chatglm"), + (llm.Gemma2Model, "gemma2"), + (llm.GemmaModel, "gemma"), + (llm.LlamaModel, "llama"), + (llm.MistralModel, "llama"), + (llm.MixtralModel, "llama"), + (llm.NemotronModel, "gptnext"), + (llm.Qwen2Model, "qwen"), + (llm.StarcoderModel, "gptnext"), + (llm.Starcoder2Model, "gptnext"), + (llm.Phi3Model, "phi3"), + ] + + for config_class, decoder_type in mapping: + if isinstance(model, config_class): + return decoder_type + + logging.warning("Could not infer the decoder type") + return None + + def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: - """Modify model config for TensorRT Model Optimizer""" + """Modify model config for TensorRT-Model-Optimizer quantization""" from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import ( 
get_gpt_layer_modelopt_spec, @@ -46,7 +71,9 @@ def quantizable_model_config(model_cfg: llm.GPTConfig) -> llm.GPTConfig: def load_with_modelopt_layer_spec( nemo_checkpoint_path: str, calib_tp: int = 1, calib_pp: int = 1, inference_only: bool = True ): - # TODO: setting ddp="pytorch" with manually deleting model.optim is a hackish way to disable DDP initialization. Needs a systematic solution. + """Loads a model from a NeMo 2.0 checkpoint using modelopt layer spec.""" + # TODO: setting ddp="pytorch" and deleting model.optim is a hackish way to disable DDP initialization. + # Needs a systematic solution. if inference_only: strategy = nl.MegatronStrategy( tensor_model_parallel_size=calib_tp, @@ -81,6 +108,7 @@ def load_with_modelopt_layer_spec( def get_unwrapped_mcore_model(model): + """Unwraps NeMo 2.0 to base MCore model.""" from megatron.core.models.gpt import GPTModel as MCoreGPTModel unwrapped_model = model diff --git a/scripts/llm/ptq.py b/scripts/llm/ptq.py index c04d32290e5f..2afe38c37b4d 100644 --- a/scripts/llm/ptq.py +++ b/scripts/llm/ptq.py @@ -17,6 +17,8 @@ def get_args(): + """Parses PTQ arguments""" + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="NeMo PTQ argument parser", @@ -58,6 +60,10 @@ def get_args(): type=str, help='Calibration dataset to be used. Should be \"wikitext\", \"cnn_dailymail\" or path to a local .json file', ) + parser.add_argument( + '--generate_sample', help='Generate sample model output after performing PTQ', action='store_true' + ) + parser.set_defaults(generate_sample=False) args = parser.parse_args() if args.output_path is None: @@ -68,6 +74,8 @@ def get_args(): def main(): + """Example NeMo 2.0 Post Training Quantization workflow""" + args = get_args() quantization_config = quantization.QuantizationConfig( @@ -87,6 +95,7 @@ def main(): inference_tensor_parallel=args.tensor_parallelism_size, inference_pipeline_parallel=args.pipeline_parallelism_size, dtype=args.dtype, + generate_sample=args.generate_sample, ) quantizer = quantization.Quantizer(quantization_config, export_config)
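With the new flag plumbed through scripts/llm/ptq.py, the pieces shown in this patch also compose into a short programmatic flow. The sketch below is illustrative rather than verbatim script code: the checkpoint path is a placeholder, QuantizationConfig is constructed with defaults because its fields are not shown in these hunks, and the "bf16" dtype value is assumed to be a member of SUPPORTED_DTYPE; the function and method signatures themselves follow the hunks above.

# Hedged sketch of the NeMo 2.0 PTQ flow exercised by scripts/llm/ptq.py.
from nemo.collections.llm import quantization
from nemo.collections.llm.quantization.utils import load_with_modelopt_layer_spec

nemo_checkpoint = "/checkpoints/llama3-8b-nemo2"  # hypothetical NeMo 2.0 checkpoint directory

quantization_config = quantization.QuantizationConfig()  # defaults; fields are not shown in these hunks
export_config = quantization.ExportConfig(
    path="/results/llama3-8b-trtllm",  # TensorRT-LLM export directory (placeholder)
    dtype="bf16",                      # assumed valid export dtype
    inference_tensor_parallel=1,
    inference_pipeline_parallel=1,
    generate_sample=True,              # new option from this patch: print a sample generation after PTQ
)

# Load the checkpoint with the modelopt layer spec so the model can be quantized (see utils.py above).
model = load_with_modelopt_layer_spec(nemo_checkpoint, calib_tp=1, calib_pp=1, inference_only=True)

quantizer = quantization.Quantizer(quantization_config, export_config)
model = quantizer.quantize(model)         # calibrate and quantize; emits a sample because generate_sample=True
quantizer.export(model, nemo_checkpoint)  # write the TRT-LLM checkpoint to export_config.path and copy the NeMo context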