diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 2c4946bbbde1..ae22ede4807b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -14,13 +14,13 @@ Add a one line overview of what this PR aims to accomplish. # Add a code snippet demonstrating how to use this ``` -# Jenkins CI +# GitHub Actions CI The Jenkins CI system has been replaced by GitHub Actions self-hosted runners. -There's no need to comment `jenkins` on the PR to trigger Jenkins CI. -The GitHub Actions CI will run automatically when the PR is opened. -To run CI on an untrusted fork, a NeMo user with write access must click "Approve and run". +The GitHub Actions CI will run automatically when the "Run CICD" label is added to the PR. +To re-run CI remove and add the label again. +To run CI on an untrusted fork, a NeMo user with write access must first click "Approve and run". # Before your PR is "Ready for review" **Pre checks**: diff --git a/.github/scripts/slackHelper.sh b/.github/scripts/slackHelper.sh new file mode 100644 index 000000000000..4696cebcf13b --- /dev/null +++ b/.github/scripts/slackHelper.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +function sendSlackMessage() { + + WEBHOOK_URL="$1" + PIPELINE_URL="$2" + + curl -X POST -H "Content-type: application/json" --data "{ + \"blocks\": [ + { + \"type\": \"section\", + \"text\": { + \"type\": \"mrkdwn\", + \"text\": \"\ +🚨 *CI/CD failure at <$PIPELINE_URL|NeMo CI>*: + +\" + } + } + ] + }" $WEBHOOK_URL + +} diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml new file mode 100644 index 000000000000..31e9452d0fe5 --- /dev/null +++ b/.github/workflows/_test_template.yml @@ -0,0 +1,58 @@ +name: ~test template + +on: + workflow_call: + inputs: + RUNNER: + type: string + description: Runner to use for test + required: true + TIMEOUT: + type: number + description: Max runtime of test in minutes + required: false + default: 10 + SCRIPT: + type: string + description: Test script to execute + required: true + AFTER_SCRIPT: + type: string + description: Script to run after main test + required: false + default: ":" + IS_OPTIONAL: + type: boolean + description: Failure will cancel all other tests if set to true + required: false + default: false + outputs: + conclusion: + description: Conclusion of main test step + value: ${{ jobs.main.outputs.conclusion }} + +jobs: + main: + runs-on: ${{ inputs.RUNNER }} + timeout-minutes: ${{ inputs.TIMEOUT }} + outputs: + conclusion: ${{ steps.main.conclusion }} + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - id: main + run: ${{ inputs.SCRIPT }} + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: failure() && inputs.IS_OPTIONAL == false + - name: after_script + if: always() && inputs.AFTER_SCRIPT != ':' + run: ${{ inputs.AFTER_SCRIPT }} \ No newline at end of file diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml deleted file mode 100644 index bdfb24c4b1e5..000000000000 --- a/.github/workflows/blossom-ci.yml +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# A workflow to trigger ci on hybrid infra (github + self hosted runner) -name: Blossom-CI -on: - issue_comment: - types: [created] - workflow_dispatch: - inputs: - platform: - description: 'runs-on argument' - required: false - args: - description: 'argument' - required: false -jobs: - Authorization: - name: Authorization - runs-on: blossom - outputs: - args: ${{ env.args }} - - # This job only runs for pull request comments - if: | - contains( 'okuchaiev,ericharper,titu1994,MaximumEntropy,nithinraok,redoctopus,yidong72,SeanNaren,yzhang123,ekmb,arendu,', format('{0},', github.actor)) && - github.event.comment.body == '/blossom-ci' - steps: - - name: Check if comment is issued by authorized person - run: blossom-ci - env: - OPERATION: 'AUTH' - REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }} - - Vulnerability-scan: - name: Vulnerability scan - needs: [Authorization] - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v2 - with: - repository: ${{ fromJson(needs.Authorization.outputs.args).repo }} - ref: ${{ fromJson(needs.Authorization.outputs.args).ref }} - lfs: 'true' - - # repo specific steps - #- name: Setup java - # uses: actions/setup-java@v1 - # with: - # java-version: 1.8 - - # add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file - #- name: Setup blackduck properties - # run: | - # PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g') - # echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties - # echo detect.maven.included.scopes=compile >> application.properties - - - name: Run blossom action - uses: NVIDIA/blossom-action@main - env: - REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }} - with: - args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }} - args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }} - args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }} - - Job-trigger: - name: Start ci job - needs: [Vulnerability-scan] - runs-on: blossom - steps: - - name: Start ci job - run: blossom-ci - env: - OPERATION: 'START-CI-JOB' - CI_SERVER: ${{ secrets.CI_SERVER }} - REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - Upload-Log: - name: Upload log - runs-on: blossom - if : github.event_name == 'workflow_dispatch' - steps: - - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here) - run: blossom-ci - env: - OPERATION: 'POST-PROCESSING' - CI_SERVER: ${{ secrets.CI_SERVER }} - REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e6e8fb808943..12b8cdcb8eed 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -15,7 +15,9 @@ name: "CICD NeMo" on: pull_request: - branches: [ "main" ] + branches: + - 'main' + - 'r**' types: [ labeled ] concurrency: @@ -41,550 +43,420 @@ jobs: docker container prune 
--filter "until=24h" --force docker image prune -a --filter "until=24h" --force -# checkout-repository: -# runs-on: self-hosted-azure -# container: -# image: nvcr.io/nvidia/pytorch:24.01-py3 -# volumes: -# - ${{ github.workspace }}:/workspace -# steps: -# - name: Checkout repository -# uses: actions/checkout@v4 -# with: -# path: ${{ github.run_id }} - cicd-test-container-setup: needs: [cicd-cluster-clean] runs-on: self-hosted-azure-builder if: ${{ github.event.label.name == 'Run CICD' }} - # uses: actions/cache@v2 - #container: -# image: nvcr.io/nvidia/pytorch:24.01-py3 -# options: -# # --user 0:128 -# --device=/dev/nvidia0 -# --gpus all -# --shm-size=8g -# --env TRANSFORMERS_OFFLINE=0 -# --env HYDRA_FULL_ERROR=1 steps: - name: Checkout repository uses: actions/checkout@v4 with: path: ${{ github.run_id }} - - - name: Container setup - run: | - # Pull base PyTorch container - docker pull nvcr.io/nvidia/pytorch:24.02-py3 - docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.01-py3 /bin/bash -c ' - set -x - - # PyTorch version - python -c "import torch; print(torch.__version__)" - python -c "import torchvision; print(torchvision.__version__)" - - # Install test requirements - apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt && pip install -r requirements/requirements_lightning.txt - - # Code formatting checks - python setup.py style - - # Copyright Headers check - python tests/check_copyright_header.py --dir . - - # NeMo Installation - ./reinstall.sh release - - # Transformer Engine installation - git clone https://github.com/NVIDIA/TransformerEngine.git && \ - pushd TransformerEngine && \ - git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \ - git checkout FETCH_HEAD && \ - git submodule init && git submodule update && \ - NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . && \ - popd - - # Apex installation - git clone https://github.com/NVIDIA/apex.git && \ - pushd apex && \ - git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \ - cp -R apex /usr/local/lib/python3.10/dist-packages && \ - popd - - # pip package should be working with main, if not we can update the commit here - # until the pip package is updated - # Megatron Core installation - git clone https://github.com/NVIDIA/Megatron-LM.git && \ - pushd Megatron-LM && \ - git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \ - pip install . 
&& \ - pushd megatron/core/datasets && \ - make && \ - popd && \ - popd - export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" - - # Install only for test: L2: Segmentation Tool - pushd tools/ctc_segmentation && \ - pip install -r requirements.txt && \ - apt-get update && apt-get install libsox-fmt-all -y && \ - popd - - # PyTorch Lightning version - python -c "import pytorch_lightning; print(pytorch_lightning.__version__)" - - # PyTorch Lightning DDP Checks - CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py" - - # Basic Import Checks - python -c "import nemo.collections.asr as nemo_asr" - python -c "import nemo.collections.nlp as nemo_nlp" - python -c "import nemo.collections.tts as nemo_tts" - - # set permission - chmod 777 -R /workspace - ' - ### \'\' - - - name: Push container to registry for future use + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + # We use `docker` driver as this speeds things up for + # trivial (non-multi-stage) builds. + driver: docker + + - name: Build and push + uses: docker/build-push-action@v5 + with: + file: Dockerfile.ci + push: true + cache-from: nemoci.azurecr.io/nemo_container:latest + cache-to: type=inline + tags: | + nemoci.azurecr.io/nemo_container_${{ github.run_id }} + nemoci.azurecr.io/nemo_container:latest + + - name: Run some checks run: | - # Push container - echo "Docker: List containers" && docker ps -a - DOCKER_COMMIT=$(docker ps --latest --quiet) # latest container - docker commit $DOCKER_COMMIT nemoci.azurecr.io/nemo_container_${{ github.run_id }} - docker tag nemoci.azurecr.io/nemo_container_${{ github.run_id }} nemoci.azurecr.io/nemo_container_${{ github.run_id }} - docker push nemoci.azurecr.io/nemo_container_${{ github.run_id }} - - # - name: Build and push to local registry - # uses: docker/build-push-action@v5 - # with: - # context: . - # push: true - # tags: nemoci.azurecr.io/name/app:latest - - # - name: Inspect - # run: | - # docker buildx imagetools inspect nemoci.azurecr.io/name/app:latest + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\ + # PyTorch Lightning version + python -c "import pytorch_lightning; print(pytorch_lightning.__version__)" + + # PyTorch Lightning DDP Checks + CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py" + + # Basic Import Checks + python -c "import nemo.collections.asr as nemo_asr" + python -c "import nemo.collections.nlp as nemo_nlp" + python -c "import nemo.collections.tts as nemo_tts" + + python setup.py style + python tests/check_copyright_header.py --dir . - #- name: Post-workflow execution - # uses: gacts/run-and-post-run@v1 - # with: - # post: | - # chmod -R 777 . 
+ # These checks are not crucial + exit 0 + ' + ### \'\' - L0_Unit_Tests_GPU: + OPTIONAL_L0_Unit_Tests_GPU: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: "L0: Unit Tests GPU" - run: | + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + TIMEOUT: 30 + SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + IS_OPTIONAL: true L0_Unit_Tests_CPU: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-cpu - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: "L0: Unit Tests CPU" - run: | + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-cpu + TIMEOUT: 60 + SCRIPT: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + L0_Setup_Test_Data_And_Models: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python -m tests.setup --save_dir /home/TestData/nlp -## - name: L2: Multimodal Imagen Train + ## - name: L2: Multimodal Imagen Train # L2: Community LLM Checkpoints tests L2_Community_LLM_Checkpoints_tests_Llama: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --output_path=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ - --precision=16 - rm -f /home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ + --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \ + --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + --precision=16 + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_llama/model_weights + + L2_Community_LLM_Checkpoints_tests_Llama3: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ + --input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \ + 
--output_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo \ + --precision=16 + AFTER_SCRIPT: | + rm -f /home/TestData/nlp/megatron_llama/llama3-ci-hf/llama3_ci.nemo + rm -rf /home/TestData/nlp/megatron_llama/llama3-ci-hf/model_weights L2_Community_LLM_Checkpoints_tests_StarCoder: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ - --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf - rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + mkdir -p /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}; + python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ + --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ + --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }} + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo; + rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/${{ github.run_id }}/ + rm -rf /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/model_weights L2_Community_LLM_Checkpoints_tests_Falcon: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ - --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo - rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ + --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ + --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo + rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_gpt/falcon-ci-hf/model_weights + + # this test is using a 7B model which is too large for GitHub CI + # replace the model in this test with a toy model or move the test + # to the nightly CI + # OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # 
--volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - run: | + # python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ + # --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ + # --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo + # rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo + # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" - L2_Community_LLM_Checkpoints_tests_Baichuan2: + L2_PTQ_Llama2_Export_Only: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ - --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + quantization.algorithm=null \ + model_save=/home/TestData/nlp/megatron_llama/ci_baseline + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_llama/ci_baseline + + L2_PTQ_Llama2_FP8: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=fp8 \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + export.inference_tensor_parallel=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo + + L2_PTQ_Llama2_INT8_SQ: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int8_sq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo + + # TODO: investigate int4_awq stuck issues and restore the test + #L2_PTQ_Llama2_INT4_AWQ: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # timeout-minutes: 10 + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # --volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout 
repository + # uses: actions/checkout@v4 + # - run: | + # python examples/nlp/language_modeling/megatron_quantization.py \ + # model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ + # tensor_model_parallel_size=1 \ + # trainer.devices=1 \ + # quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + # quantization.algorithm=int4_awq \ + # quantization.num_calib_size=8 \ + # inference.batch_size=2 \ + # model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo + # + # rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo + #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" # L2: ASR dev run ASR_dev_run_Speech_to_Text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results - rm -rf examples/asr/speech_to_text_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc.py \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results ASR_dev_run_Speech_to_Text_WPE_-_CitriNet: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/citrinet/" --config-name="config_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results - rm -rf examples/asr/speech_to_text_wpe_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ + --config-path="../conf/citrinet/" --config-name="config_bpe" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + 
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ + model.tokenizer.type="wpe" \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_wpe_results ASR_dev_run_Speech_Pre-training_-_CitriNet: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_pretraining/speech_pre_training.py \ - --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_pre_training_results - rm -rf examples/asr/speech_pre_training_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_pretraining/speech_pre_training.py \ + --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_pre_training_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_pre_training_results ASR_dev_run_Speech_To_Text_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results - rm -rf examples/asr/speech_finetuning_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - ASR_dev_run_Speech_To_Text_HF_Finetuning: + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_to_text_finetune.py \ + --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + 
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ + model.tokenizer.update_tokenizer=False \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_finetuning_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_finetuning_results + + OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ - ~model.train_ds.hf_data_cfg \ - model.train_ds.num_workers=1 \ - model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ - model.train_ds.streaming=true \ - +model.train_ds.hf_data_cfg.path="librispeech_asr" \ - +model.train_ds.hf_data_cfg.name=null \ - +model.train_ds.hf_data_cfg.split="test.clean" \ - +model.train_ds.hf_data_cfg.streaming=true \ - ~model.validation_ds.hf_data_cfg \ - model.validation_ds.streaming=true \ - +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ - +model.validation_ds.hf_data_cfg.name=null \ - +model.validation_ds.hf_data_cfg.split="test.clean" \ - +model.validation_ds.hf_data_cfg.streaming=true \ - ~model.test_ds \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - model.optim.sched.warmup_steps=0 \ - +model.optim.sched.max_steps=3 \ - trainer.max_epochs=null \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results - rm -rf examples/asr/speech_finetuning_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: |- + python examples/asr/speech_to_text_finetune.py \ + --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ + ~model.train_ds.hf_data_cfg \ + model.train_ds.num_workers=1 \ + model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ + model.train_ds.streaming=true \ + +model.train_ds.hf_data_cfg.path="librispeech_asr" \ + +model.train_ds.hf_data_cfg.name=null \ + +model.train_ds.hf_data_cfg.split="test.clean" \ + +model.train_ds.hf_data_cfg.streaming=true \ + ~model.validation_ds.hf_data_cfg \ + model.validation_ds.streaming=true \ + +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ + +model.validation_ds.hf_data_cfg.name=null \ + +model.validation_ds.hf_data_cfg.split="test.clean" \ + +model.validation_ds.hf_data_cfg.streaming=true \ + ~model.test_ds \ + init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ + model.tokenizer.update_tokenizer=False \ + model.optim.sched.warmup_steps=0 \ + +model.optim.sched.max_steps=3 \ + trainer.max_epochs=null \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_finetuning_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_finetuning_results + IS_OPTIONAL: true 
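For reference, a minimal sketch of a job calling the new reusable template follows; the job name and test command are placeholders, and per `_test_template.yml` only `RUNNER` and `SCRIPT` are required inputs, while `TIMEOUT`, `AFTER_SCRIPT`, and `IS_OPTIONAL` fall back to their declared defaults (10 minutes, `":"`, and `false`).

```yaml
# Hypothetical caller sketch (not part of this diff).
My_New_Test:                          # placeholder job name
  needs: [cicd-test-container-setup]
  uses: ./.github/workflows/_test_template.yml
  with:
    RUNNER: self-hosted-azure         # required input
    SCRIPT: |                         # required input
      pytest tests/my_new_test.py     # placeholder test command
    # TIMEOUT, AFTER_SCRIPT, and IS_OPTIONAL are optional and default to
    # 10, ":" and false respectively, as declared in _test_template.yml.
```

When `IS_OPTIONAL` is left at `false`, the template's cancel-workflow step runs on failure and stops the remaining jobs; setting it to `true` lets the rest of the matrix continue, which is how the `OPTIONAL_*` jobs above are wired.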
ASR_dev_run_Speech_to_Text_WPE_-_Conformer: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results - rm -rf examples/asr/speech_to_text_wpe_conformer_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ + --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ + model.tokenizer.type="wpe" \ + model.train_ds.batch_size=4 \ + model.validation_ds.batch_size=4 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_wpe_conformer_results # L2: ASR dev run - part two ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.encoder.d_model=144 \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results - rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ + --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ + 
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ + model.tokenizer.type="wpe" \ + model.encoder.d_model=144 \ + model.train_ds.batch_size=4 \ + model.validation_ds.batch_size=4 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results L2_Speech_to_Text_EMA: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - +exp_manager.ema.enable=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results - rm -rf examples/asr/speech_to_text_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc.py \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=2 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + +exp_manager.ema.enable=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results + # L2_Speech_to_Text_AED: # needs: [cicd-test-container-setup] @@ -638,514 +510,315 @@ jobs: # L2: Speaker dev run L2_Speaker_dev_run_Speaker_Recognition: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/recognition/speaker_reco.py \ - model.train_ds.batch_size=10 \ - model.validation_ds.batch_size=2 \ - model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ - model.decoder.num_classes=2 \ - trainer.max_epochs=10 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results - rm -rf examples/speaker_tasks/recognition/speaker_recognition_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/speaker_tasks/recognition/speaker_reco.py \ + model.train_ds.batch_size=10 \ + model.validation_ds.batch_size=2 \ 
+ model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ + model.decoder.num_classes=2 \ + trainer.max_epochs=10 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/recognition/speaker_recognition_results L2_Speaker_dev_run_Speaker_Diarization: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ - model.diarizer.speaker_embeddings.model_path=titanet_large \ - model.train_ds.batch_size=5 \ - model.validation_ds.batch_size=5 \ - model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results - rm -rf examples/speaker_tasks/diarization/speaker_diarization_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ + model.diarizer.speaker_embeddings.model_path=titanet_large \ + model.train_ds.batch_size=5 \ + model.validation_ds.batch_size=5 \ + model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ + model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ + model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/diarization/speaker_diarization_results L2_Speaker_dev_run_Speech_to_Label: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - 
model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results - rm -rf examples/asr/speech_to_label_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_classification/speech_to_label.py \ + model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ + model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ + model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ + ~model.preprocessor.window_size \ + ~model.preprocessor.window_stride \ + ~model.preprocessor.window \ + ~model.preprocessor.n_mels \ + ~model.preprocessor.n_mfcc \ + ~model.preprocessor.n_fft \ + exp_manager.exp_dir=examples/asr/speech_to_label_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_label_results L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ - diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ - diarizer.asr.model_path=QuartzNet15x5Base-En \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results - rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ + diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ + diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ + diarizer.speaker_embeddings.parameters.save_embeddings=True \ + diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ + diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ + diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ + 
diarizer.asr.model_path=QuartzNet15x5Base-En \ + diarizer.asr.parameters.asr_based_vad=True \ + diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results L2_Speaker_dev_run_Clustering_Diarizer_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ - diarizer.speaker_embeddings.parameters.multiscale_weights=null \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results - rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ + diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ + diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ + diarizer.speaker_embeddings.parameters.save_embeddings=True \ + diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ + diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ + diarizer.speaker_embeddings.parameters.multiscale_weights=null \ + diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ + diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results L2_Speaker_dev_run_Neural_Diarizer_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results - rm -rf examples/speaker_tasks/diarization/neural_diarizer_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + 
with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ + diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ + diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ + diarizer.speaker_embeddings.parameters.save_embeddings=True \ + diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ + diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results + AFTER_SCRIPT: | + rm -rf examples/speaker_tasks/diarization/neural_diarizer_results L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python tools/speech_data_simulator/multispeaker_simulator.py \ - --config-path=conf --config-name=data_simulator.yaml \ - data_simulator.random_seed=42 \ - data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ - data_simulator.outputs.output_dir=./test_simulator \ - data_simulator.session_config.num_sessions=2 \ - data_simulator.session_config.session_length=60 - rm -rf ./test_simulator - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python tools/speech_data_simulator/multispeaker_simulator.py \ + --config-path=conf --config-name=data_simulator.yaml \ + data_simulator.random_seed=42 \ + data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ + data_simulator.outputs.output_dir=./test_simulator \ + data_simulator.session_config.num_sessions=2 \ + data_simulator.session_config.session_length=60 + AFTER_SCRIPT: | + rm -rf ./test_simulator # L2: ASR Multi-dataloader dev run L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - exp_manager.exp_dir=examples/asr/speech_to_text_results - rm -rf examples/asr/speech_to_text_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_ctc/speech_to_text_ctc.py \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + 
model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + +trainer.num_sanity_val_steps=1 \ + exp_manager.exp_dir=examples/asr/speech_to_text_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_results L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results - rm -rf examples/asr/speech_to_label_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/speech_classification/speech_to_label.py \ + model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ + model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + +trainer.num_sanity_val_steps=1 \ + model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ + ~model.preprocessor.window_size \ + ~model.preprocessor.window_stride \ + ~model.preprocessor.window \ + ~model.preprocessor.n_mels \ + ~model.preprocessor.n_mfcc \ + ~model.preprocessor.n_fft \ + exp_manager.exp_dir=examples/asr/speech_to_label_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_label_results # L2: ASR Adapters L2_ASR_Adapters_Linear_Adapters: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="an4" \ - model.adapter.linear.in_features=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - 
trainer.max_steps=5 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results - rm -rf examples/asr/speech_to_text_adapters_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_adapters/train_asr_adapter.py \ + model.pretrained_model="stt_en_conformer_ctc_small" \ + model.adapter.adapter_name="an4" \ + model.adapter.linear.in_features=176 \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.max_steps=5 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_adapters_results L2_ASR_Adapters_RelPos_MHA_Adapters: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="encoder:an4" \ - model.adapter.adapter_type="tiny_attn" \ - model.adapter.tiny_attn.n_feat=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results - rm -rf examples/asr/speech_to_text_adapters_mha_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/asr/asr_adapters/train_asr_adapter.py \ + model.pretrained_model="stt_en_conformer_ctc_small" \ + model.adapter.adapter_name="encoder:an4" \ + model.adapter.adapter_type="tiny_attn" \ + model.adapter.tiny_attn.n_feat=176 \ + model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ + model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ + trainer.max_steps=5 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=True \ + exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results + AFTER_SCRIPT: | + rm -rf examples/asr/speech_to_text_adapters_mha_results # L2: Speech Transcription L2_Speech_Transcription_Speech_to_Text_Transcribe: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/asr/transcribe_speech.py \ - pretrained_name="QuartzNet15x5Base-En" \ - 
audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - output_filename="stt_test_res.json" \ - amp=true - rm -rf stt_test_res.json - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/asr/transcribe_speech.py \ + pretrained_name="QuartzNet15x5Base-En" \ + audio_dir="/home/TestData/an4_transcribe/test_subset/" \ + output_filename="stt_test_res.json" \ + amp=true + AFTER_SCRIPT: | + rm -rf stt_test_res.json # L2: Transducer alignment L2_Transducer_alignment_Running_pytest: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 # L2: Segmentation Tool L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ - --DATA_DIR=/home/TestData/ctc_segmentation/eng \ - --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \ - --LANGUAGE=en \ - --USE_NEMO_NORMALIZATION="TRUE" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ - -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt && \ - rm -rf /home/TestData/ctc_segmentation/eng/output${TIME} - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd tools/ctc_segmentation && \ + TIME=`date +"%Y-%m-%d-%T"` && \ + /bin/bash run_segmentation.sh \ + --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ + --DATA_DIR=/home/TestData/ctc_segmentation/eng \ + --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \ + --LANGUAGE=en \ + --USE_NEMO_NORMALIZATION="TRUE" && \ + python /home/TestData/ctc_segmentation/verify_alignment.py \ + -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ + -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt; + AFTER_SCRIPT: | + rm -rf /home/TestData/ctc_segmentation/eng/output${TIME} L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - 
--device=/dev/nvidia0
-            --gpus all
-            --shm-size=8g
-            --env TRANSFORMERS_OFFLINE=0
-            --env HYDRA_FULL_ERROR=1
-            --volume /mnt/datadrive/TestData:/home/TestData
-        steps:
-        - name: Checkout repository
-          uses: actions/checkout@v4
-        - run: |
-            cd tools/ctc_segmentation && \
-            TIME=`date +"%Y-%m-%d-%T"` && \
-            /bin/bash run_segmentation.sh \
-            --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \
-            --DATA_DIR=/home/TestData/ctc_segmentation/ru \
-            --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \
-            --LANGUAGE=ru \
-            --ADDITIONAL_SPLIT_SYMBOLS=";" && \
-            python /home/TestData/ctc_segmentation/verify_alignment.py \
-            -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \
-            -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt && \
-            rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}
-        - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-          if: "failure()"
-
+      uses: ./.github/workflows/_test_template.yml
+      with:
+        RUNNER: self-hosted-azure
+        SCRIPT: |
+          cd tools/ctc_segmentation && \
+          TIME=`date +"%Y-%m-%d-%T"` && \
+          /bin/bash run_segmentation.sh \
+            --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \
+            --DATA_DIR=/home/TestData/ctc_segmentation/ru \
+            --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \
+            --LANGUAGE=ru \
+            --ADDITIONAL_SPLIT_SYMBOLS=";" && \
+          python /home/TestData/ctc_segmentation/verify_alignment.py \
+            -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \
+            -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt;
+
+          rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}

    # L2: G2P Models
    L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference:
      needs: [cicd-test-container-setup]
-        runs-on: self-hosted-azure-gpus-1
-        container:
-          image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-          options:
-            # --user 0:128
-            --device=/dev/nvidia0
-            --gpus all
-            --shm-size=8g
-            --env TRANSFORMERS_OFFLINE=0
-            --env HYDRA_FULL_ERROR=1
-            --volume /mnt/datadrive/TestData:/home/TestData
-        steps:
-        - name: Checkout repository
-          uses: actions/checkout@v4
-        - run: |
-            cd examples/tts/g2p && \
-            TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
-            python g2p_train_and_evaluate.py \
-            train_manifest=/home/TestData/g2p/g2p.json \
-            validation_manifest=/home/TestData/g2p/g2p.json \
-            model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \
-            model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \
-            trainer.max_epochs=1 \
-            model.max_source_len=64 \
-            trainer.devices=1 \
-            do_training=True \
-            do_testing=True \
-            exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \
-            +exp_manager.use_datetime_version=False\
-            +exp_manager.version=test \
-            --config-name=g2p_conformer_ctc && \
-            python g2p_inference.py \
-            pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \
-            manifest_filepath=/home/TestData/g2p/g2p.json \
-            phoneme_field=text
-        - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-          if: "failure()"
+      uses: ./.github/workflows/_test_template.yml
+      with:
+        RUNNER: self-hosted-azure
+        SCRIPT: |
+          cd examples/tts/g2p && \
+          TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
+          python g2p_train_and_evaluate.py \
+            train_manifest=/home/TestData/g2p/g2p.json \
+            validation_manifest=/home/TestData/g2p/g2p.json \
+            model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \
+            
model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ + trainer.max_epochs=1 \ + model.max_source_len=64 \ + trainer.devices=1 \ + do_training=True \ + do_testing=True \ + exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ + +exp_manager.use_datetime_version=False\ + +exp_manager.version=test \ + --config-name=g2p_conformer_ctc && \ + python g2p_inference.py \ + pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ + manifest_filepath=/home/TestData/g2p/g2p.json \ + phoneme_field=text # TODO: pleasefixme @redoctopus # - name: ByT5G2P training, evaluation and inference @@ -1175,42 +848,28 @@ jobs: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ - python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/manifest.json \ - validation_manifest=/home/TestData/g2p/manifest.json \ - test_manifest=/home/TestData/g2p/manifest.json \ - model.wordids=/home/TestData/g2p/wordids.tsv \ - trainer.max_epochs=1 \ - model.max_seq_length=64 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test && \ - python g2p_heteronym_classification_inference.py \ - manifest=/home/TestData/g2p/manifest.json \ - pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ - output_manifest=preds.json - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/tts/g2p && \ + TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ + python g2p_heteronym_classification_train_and_evaluate.py \ + train_manifest=/home/TestData/g2p/manifest.json \ + validation_manifest=/home/TestData/g2p/manifest.json \ + test_manifest=/home/TestData/g2p/manifest.json \ + model.wordids=/home/TestData/g2p/wordids.tsv \ + trainer.max_epochs=1 \ + model.max_seq_length=64 \ + do_training=True \ + do_testing=True \ + exp_manager.exp_dir=${OUTPUT_DIR} \ + +exp_manager.use_datetime_version=False\ + +exp_manager.version=test && \ + python g2p_heteronym_classification_inference.py \ + manifest=/home/TestData/g2p/manifest.json \ + pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ + output_manifest=preds.json # L2: Dialogue Classification @@ -1258,320 +917,217 @@ jobs: L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - 
model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + model.dataset.data_dir=/home/TestData/nlp/sgd_small \ + model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ + model.dataset.task_name=debug_sample \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.dataset.num_tasks=6 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-cased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_bert_outputs L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ + model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ + model.dataset.task=assistant \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_bert_intent_classification_outputs L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel: needs: 
[cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ + model.dataset.task=zero_shot \ + model.dataset.prompt_template="This example is" \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf sgd_gen_zero_shot_intent_classification_outputs L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - 
model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="This example is related to" \ + model.library=megatron \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_zero_shot_intent_classification_outputs L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ + model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="This example is related to" \ + model.library=huggingface \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=bert-base-uncased \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_zero_shot_intent_classification_bart_outputs L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: 
nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/design_dataset \ + model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ + model.dataset.task=design \ + model.dataset.prompt_template="" \ + model.library=huggingface \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf design_dialogue_nearest_neighbour_classification_outputs # L2: Dialogue Generation L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ + model.dataset.dialogues_example_dir=answer_extender_s2s \ + model.dataset.task=ms_marco \ + model.library=huggingface \ + model.dataset.debug_mode=True \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + 
model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=facebook/bart-large \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf answer_extender_s2s L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_answer_extender_s2s - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/sgd_small \ + model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ + model.dataset.task_name=debug_sample \ + model.dataset.task=sgd_generation \ + model.dataset.input_field=utterance+system_actions \ + model.dataset.output_field=system_utterance \ + model.dataset.use_cache=false \ + model.dataset.system_utterance=next_turn \ + model.dataset.debug_mode=True \ + model.dataset.prompt_template=slots_values \ + model.library=huggingface \ + trainer.max_steps=1 \ + trainer.max_epochs=1 \ + model.train_ds.batch_size=2 \ + model.validation_ds.batch_size=2 \ + model.test_ds.batch_size=2 \ + model.nemo_path=null \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.language_model.pretrained_model_name=facebook/bart-large \ + trainer.accelerator=gpu \ + exp_manager=null + AFTER_SCRIPT: | + rm -rf sgd_answer_extender_s2s # - name: L2: Dialogue Generation Part 2 # when { @@ -1607,80 +1163,54 @@ jobs: # L2: COPY L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - 
model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/dialogue && \ + python dialogue.py \ + do_training=False \ + model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ + model.dataset.dialogues_example_dir=answer_extender \ + model.library=huggingface \ + model.dataset.task=ms_marco \ + model.dataset.debug_mode=True \ + trainer.val_check_interval=0.0 \ + trainer.devices=1 \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name=gpt2 \ + trainer.accelerator=gpu \ + exp_manager=null && \ + rm -rf answer_extender # L2: Duplex Text Normalization L2_Duplex_Text_Normalization_with_Tarred_dataset: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/duplex_text_normalization && \ + python duplex_text_normalization_train.py \ + data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ + mode=tn \ + lang=en \ + tagger_model.do_training=false \ + decoder_model.transformer=t5-small \ + data.validation_ds.batch_size=2 \ + data.train_ds.use_cache=false \ + data.validation_ds.use_cache=false \ + data.test_ds.batch_size=2 \ + data.train_ds.decoder_data_augmentation=false \ + data.train_ds.num_workers=2 \ + decoder_trainer.devices=[0,1] \ + decoder_trainer.accelerator="gpu" \ + data.train_ds.use_tarred_dataset=true \ + +decoder_trainer.fast_dev_run=true \ + decoder_exp_manager.create_checkpoint_callback=false \ + data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ + data.test_ds.use_cache=false \ + 
data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv # Runs out of memory on the 12G TITAN V (GPU 0 on main CI) # TODO: add when megatron bert is supported again in NeMo @@ -1713,336 +1243,221 @@ jobs: # L2: BERT Text Classification L2_BERT_Text_Classification_with_BERT_Test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/text_classification && \ + python text_classification_with_bert.py \ + model.dataset.num_classes=6 \ + model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ + model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + model.train_ds.batch_size=10 \ + model.dataset.max_seq_length=50 \ + model.dataset.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager=null # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_BERT_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: 
self-hosted-azure-gpus-1 + SCRIPT: | + # Cannot do fast_dev_run because squad needs whole dev dataset + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=bert-base-uncased \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_BERT_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + # Cannot do fast_dev_run because squad needs whole dev dataset + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=bert-base-uncased \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_BART_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - 
python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=facebook/bart-base \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_BART_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + 
model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=facebook/bart-base \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + model.test_ds.num_samples=2 \ + model.test_ds.batch_size=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.language_model.pretrained_model_name=gpt2 \ + model.dataset.version_2_with_negative=false \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - 
model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/question_answering && \ + python question_answering.py \ + model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ + model.dataset.use_cache=false \ + model.dataset.check_if_answer_in_context=false \ + model.train_ds.batch_size=2 \ + model.train_ds.num_samples=2 \ + model.validation_ds.batch_size=2 \ + model.validation_ds.num_samples=2 \ + trainer.max_epochs=1 \ + trainer.max_steps=1 \ + model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ + model.language_model.pretrained_model_name=gpt2 \ + model.dataset.version_2_with_negative=true \ + trainer.precision=16 \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + exp_manager=null # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - rm -rf checkpoints - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/intent_slot_classification && \ + python intent_slot_classification.py \ + model.data_dir=/home/TestData/nlp/retail \ + model.validation_ds.prefix=dev \ + model.test_ds.prefix=dev \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=checkpoints + AFTER_SCRIPT: | + rm -rf checkpoints L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - rm -rf checkpoints2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: 
./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/intent_slot_classification && \ + python multi_label_intent_slot_classification.py \ + model.data_dir=/home/TestData/nlp/new_multiatis \ + model.validation_ds.prefix=dev \ + model.test_ds.prefix=dev \ + trainer.devices=1 \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=checkpoints2 + AFTER_SCRIPT: | + rm -rf checkpoints2 # TODO: add when megatron-bert is supported again # stage('L2: Model Parallel Size 2 Megatron Text Classification') { @@ -2153,342 +1568,246 @@ jobs: # L2: Parallel NLP Examples 2 L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + python token_classification_train.py \ + pretrained_model=ner_en_bert \ + model.dataset.data_dir=/home/TestData/nlp/ner/ \ + model.train_ds.batch_size=2 \ + model.dataset.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + model.dataset.class_balancing="weighted_loss" \ + exp_manager.exp_dir=null L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null && \ - rm -rf "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt 
"${data_dir}"/ && \ + python punctuation_capitalization_train_evaluate.py \ + pretrained_model=punctuation_en_bert \ + model.train_ds.ds_item="${data_dir}" \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager.exp_dir=null; + + rm -rf "${data_dir}" L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/token_classification && \ + python token_classification_train.py \ + model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + model.dataset.use_cache=false \ + model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ + exp_manager.exp_dir=null + L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/token_classification/token_classification_evaluate.py \ + model.dataset.data_dir=/home/TestData/nlp/ner/ \ + model.dataset.use_cache=false \ + pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: 
actions/checkout@v4 - - run: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo && \ - rm -rf "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ + python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ + +do_training=false \ + +do_testing=true \ + model.test_ds.ds_item="${data_dir}" \ + ~model.train_ds \ + ~model.validation_ds \ + +model.test_ds.use_cache=false \ + pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; + + rm -rf "${data_dir}" + L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: 
self-hosted-azure + SCRIPT: | + cd examples/nlp/token_classification && \ + output_dir="$(mktemp -d -p "$(pwd)")" && \ + tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${tmp_data_dir}" \ + model.validation_ds.ds_item="${tmp_data_dir}" \ + model.test_ds.ds_item="${tmp_data_dir}" \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir="${output_dir}" \ + +do_testing=true && \ + tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ + mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ + rm -rf "${tmp_data_dir}" && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${tmp_data_dir_2}" \ + model.validation_ds.ds_item="${tmp_data_dir_2}" \ + model.test_ds.ds_item="${tmp_data_dir_2}" \ + pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + exp_manager=null; + + rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ + "${tmp_data_dir_2}" \ + "${output_dir}" # Punctuation & Capitalization tarred dataset: Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - 
model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output && \ - rm -rf "${output_dir}" "${data_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + data_dir="$(mktemp -d -p "$(pwd)")" && \ + cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ + /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ + "${data_dir}"/ && \ + usual_data=${data_dir}/wmt_wiki_10000 && \ + output_dir="$(mktemp -d -p "$(pwd)")" && \ + tarred_data=${output_dir}/train_tarred && \ + tokens_in_batch=2000 && \ + max_seq_length=512 && \ + lm_model=distilbert-base-uncased && \ + python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ + --text ${usual_data}/input.txt \ + --labels ${usual_data}/labels.txt \ + --output_dir ${tarred_data} \ + --tokens_in_batch ${tokens_in_batch} \ + --max_seq_length 512 \ + --lines_per_dataset_fragment 2000 \ + --num_batches_per_tarfile 5 \ + --tar_file_prefix punctuation_capitalization \ + --tokenizer_name ${lm_model} \ + --use_fast_tokenizer \ + --pad_label O \ + --n_jobs 3 && \ + echo "Number of tarred files in dataset:" && \ + ls ${tarred_data}/*.tar | wc -l && \ + echo "Label id files in dataset:" && \ + ls ${tarred_data}/*.csv && \ + metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ + python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + model.train_ds.ds_item=${tarred_data} \ + model.language_model.pretrained_model_name=${lm_model} \ + model.train_ds.use_tarred_dataset=true \ + model.train_ds.tar_metadata_file=${metadata_file} \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.accelerator="gpu" \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir=${output_dir}/output; + + rm -rf "${output_dir}" "${data_dir}" # Punctuation_Capitalization_Different_ways_of_passing_labels_to_model Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" 
&& \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/token_classification && \ + work_dir="$(mktemp -d -p "$(pwd)")" && \ + label_vocab_dir="${work_dir}/labels" && \ + mkdir -p ${label_vocab_dir} && \ + data_dir="${work_dir}/data" && \ + mkdir -p "${data_dir}" && \ + cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ + output_dir="${work_dir}/output" && \ + mkdir -p "${output_dir}" && \ + punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ + capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ + printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ + printf "O\nU\n" > "${capit_label_vocab}" && \ + python punctuation_capitalization_train_evaluate.py \ + model.train_ds.use_tarred_dataset=false \ + model.train_ds.ds_item="${data_dir}" \ + model.validation_ds.ds_item="${data_dir}" \ + model.test_ds.ds_item="${data_dir}" \ + model.language_model.pretrained_model_name=distilbert-base-uncased \ + model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ + model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ + model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.strategy=ddp \ + trainer.max_epochs=1 \ + +exp_manager.explicit_log_dir="${output_dir}" \ + +do_testing=false && \ + python punctuation_capitalization_train_evaluate.py \ + +do_training=false \ + +do_testing=true \ + ~model.train_ds \ + ~model.validation_ds \ + model.test_ds.ds_item="${data_dir}" \ + pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ + +model.train_ds.use_cache=false \ + +model.validation_ds.use_cache=false \ + +model.test_ds.use_cache=false \ + trainer.devices=[0,1] \ + trainer.strategy=ddp \ + 
trainer.max_epochs=1 \ + exp_manager=null && \ + rm -rf "${work_dir}" + # TODO: pleasefixme # Punctuation_Capitalization_Using_model-common_datasets_parameters-punct-capit-_label_ids: # needs: [cicd-test-container-setup] @@ -2555,670 +1874,501 @@ jobs: # Punctuation & Capitalization inference Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32 && \ - rm -rf "${output_dir}" - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + output_dir="$(mktemp -d -p "$(pwd)")" && \ + python examples/nlp/token_classification/punctuate_capitalize_infer.py \ + --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ + --output_text "${output_dir}/iwslt_inference_result.txt" \ + --max_seq_length 92 \ + --step 8 \ + --margin 16 \ + --pretrained_name punctuation_en_bert \ + --batch_size 32; + rm -rf "${output_dir}" + # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=true \ - model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ - model.train_ds.batch_size=32 \ - model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ - model.validation_ds.batch_size=32 \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ - model.optim.lr=0.01 \ - model.optim.sched.warmup_ratio=0.1 \ - model.tokenizer.tokenizer_name=sentencepiece \ - model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ - model.mask_prob=0.15 \ - model.short_seq_prob=0.1 \ - exp_manager.exp_dir=PretrainingBERTFromText \ - - rm -f /home/TestData/nlp/wikitext-2/*.pkl - #rm -rf examples/nlp/language_modeling/PretrainingBERTFromText - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/language_modeling && \ + python bert_pretraining.py \ + 
--config-name=bert_pretraining_from_text_config.yaml \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.precision=16 \ + +trainer.fast_dev_run=true \ + model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ + model.train_ds.batch_size=32 \ + model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ + model.validation_ds.batch_size=32 \ + model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ + model.optim.lr=0.01 \ + model.optim.sched.warmup_ratio=0.1 \ + model.tokenizer.tokenizer_name=sentencepiece \ + model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ + model.mask_prob=0.15 \ + model.short_seq_prob=0.1 \ + exp_manager.exp_dir=PretrainingBERTFromText; + AFTER_SCRIPT: | + rm -f /home/TestData/nlp/wikitext-2/*.pkl + #rm -rf examples/nlp/language_modeling/PretrainingBERTFromText L2_Pretraining_BERT_from_Preprocessed: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=false \ - +trainer.max_epochs=1 \ - +trainer.limit_val_batches=0 \ - +trainer.limit_train_batches=1 \ - model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ - model.train_ds.batch_size=8 \ - model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ - model.optim.lr=0.875e-4 \ - model.optim.weight_decay=0.01 \ - model.optim.sched.warmup_ratio=0.01 \ - exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ - exp_manager.create_checkpoint_callback=False \ - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/language_modeling && \ + python bert_pretraining.py \ + --config-name=bert_pretraining_from_preprocessed_config.yaml \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + trainer.precision=16 \ + +trainer.fast_dev_run=false \ + +trainer.max_epochs=1 \ + +trainer.limit_val_batches=0 \ + +trainer.limit_train_batches=1 \ + model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ + model.train_ds.batch_size=8 \ + model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ + model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ + model.optim.lr=0.875e-4 \ + model.optim.weight_decay=0.01 \ + model.optim.sched.warmup_ratio=0.01 \ + exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ + exp_manager.create_checkpoint_callback=False \ + #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" # L2: Entity Linking L2_Entity_Linking_Self_Alignment_Pretraining_BERT: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ 
github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. \ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/entity_linking && \ + python self_alignment_pretraining.py \ + project_dir=. \ + trainer.val_check_interval=3 \ + model.raw_data=None \ + model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ + model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ + model.train_ds.batch_size=8 \ + model.validation_ds.batch_size=8 \ + exp_manager.exp_dir=null # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 # is in the release container # L2: NMT Attention is All You Need Training L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=false \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=2 \ - +trainer.limit_val_batches=1 \ - +trainer.max_steps=2 \ - trainer.precision=16 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true - - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src 
\ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - +exp_manager.resume_if_exists=True - - rm -rf examples/nlp/machine_translation/nmt_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/machine_translation/enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=false \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.encoder.inner_size=256 \ + model.decoder.num_layers=1 \ + model.decoder.hidden_size=64 \ + model.decoder.inner_size=256 \ + +model.optim.capturable=True \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.val_check_interval=2 \ + +trainer.limit_val_batches=1 \ + +trainer.max_steps=2 \ + trainer.precision=16 \ + +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ + +exp_manager.create_checkpoint_callback=true + + python examples/nlp/machine_translation/enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + 
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.encoder.inner_size=256 \ + model.decoder.num_layers=1 \ + model.decoder.hidden_size=64 \ + model.decoder.inner_size=256 \ + +model.optim.capturable=True \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.val_check_interval=10 \ + +trainer.limit_val_batches=1 \ + +trainer.limit_test_batches=1 \ + +trainer.max_steps=10 \ + +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ + +exp_manager.create_checkpoint_callback=true \ + +exp_manager.resume_if_exists=True + AFTER_SCRIPT: | + rm -rf examples/nlp/machine_translation/nmt_results L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.encoder.pre_ln=true \ + model.decoder.pre_ln=true \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + +trainer.limit_test_batches=2 \ + exp_manager=null L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: needs: [cicd-test-container-setup] - runs-on: 
self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_testing=true \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ + model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ + model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ + model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ + model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ + model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + +trainer.limit_test_batches=2 \ + exp_manager=null # L2: NMT Attention is All You Need Inference L2_NMT_Attention_is_All_You_Need_Inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - 
--srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang en \ - --source_lang de - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/machine_translation && \ + python nmt_transformer_infer.py \ + --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ + --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ + --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ + --target_lang en \ + --source_lang de # L2: NMT Attention is All You Need Finetuning L2_NMT_Attention_is_All_You_Need_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt_finetune.py \ - model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - trainer.devices=1 \ - ~trainer.max_epochs \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true - - rm -rf examples/nlp/machine_translation/nmt_finetune - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt_finetune.py \ + model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ + trainer.devices=1 \ + ~trainer.max_epochs \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + +trainer.val_check_interval=10 \ + +trainer.limit_val_batches=1 \ + +trainer.limit_test_batches=1 \ + +trainer.max_steps=10 \ + +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ + +exp_manager.create_checkpoint_callback=True \ + +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ + 
+exp_manager.checkpoint_callback_params.mode=max \ + +exp_manager.checkpoint_callback_params.save_best_model=true + AFTER_SCRIPT: | + rm -rf examples/nlp/machine_translation/nmt_finetune # L2: NMT Tarred Dataset Creation L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_training=false \ - model.preproc_out_dir=$PWD/preproc_out_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.n_preproc_jobs=2 \ - model.train_ds.lines_per_dataset_fragment=500 \ - model.train_ds.num_batches_per_tarfile=10 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.vocab_size=2000 \ - model.decoder_tokenizer.vocab_size=2000 \ - ~model.test_ds \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null \ - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + cd examples/nlp/machine_translation && \ + python enc_dec_nmt.py \ + --config-path=conf \ + --config-name=aayn_base \ + do_training=false \ + model.preproc_out_dir=$PWD/preproc_out_dir \ + model.train_ds.use_tarred_dataset=true \ + model.train_ds.n_preproc_jobs=2 \ + model.train_ds.lines_per_dataset_fragment=500 \ + model.train_ds.num_batches_per_tarfile=10 \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.encoder_tokenizer.vocab_size=2000 \ + model.decoder_tokenizer.vocab_size=2000 \ + ~model.test_ds \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.fast_dev_run=true \ + exp_manager=null L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - cd examples/nlp/machine_translation && \ - python create_tarred_parallel_dataset.py \ - --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - --out_dir $PWD/out_dir \ - --encoder_tokenizer_vocab_size=2000 \ - --decoder_tokenizer_vocab_size=2000 \ - --tokens_in_batch=1000 \ - --lines_per_dataset_fragment=500 \ - 
--num_batches_per_tarfile=10 \ - --n_preproc_jobs=2 \ - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + cd examples/nlp/machine_translation && \ + python create_tarred_parallel_dataset.py \ + --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + --out_dir $PWD/out_dir \ + --encoder_tokenizer_vocab_size=2000 \ + --decoder_tokenizer_vocab_size=2000 \ + --tokens_in_batch=1000 \ + --lines_per_dataset_fragment=500 \ + --num_batches_per_tarfile=10 \ + --n_preproc_jobs=2 L2_Megatron_NMT_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - 
trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - rm -rf examples/nlp/machine_translation/megatron_nmt_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/machine_translation/megatron_nmt_training.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + +trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='swiglu' \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + 
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+ model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
+ model.train_ds.num_workers=1 \
+ model.validation_ds.num_workers=1 \
+ ~model.test_ds \
+ model.train_ds.dataset_type=text_memmap \
+ model.encoder_tokenizer.library=sentencepiece \
+ model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
+ model.decoder_tokenizer.library=sentencepiece \
+ model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model
+ # Change val_check_interval to 1 for resume as the len(dataloader) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error
+ # if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run()
+ python examples/nlp/machine_translation/megatron_nmt_training.py \
+ trainer.devices=2 \
+ trainer.accelerator=gpu \
+ trainer.log_every_n_steps=1 \
+ trainer.val_check_interval=1 \
+ +trainer.limit_val_batches=2 \
+ trainer.accumulate_grad_batches=1 \
+ trainer.max_steps=10 \
+ trainer.precision=16 \
+ trainer.gradient_clip_val=1.0 \
+ exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \
+ model.tensor_model_parallel_size=2 \
+ model.seq_length=128 \
+ model.encoder.num_layers=4 \
+ model.encoder.hidden_size=64 \
+ model.encoder.num_attention_heads=8 \
+ model.encoder.activation='swiglu' \
+ model.encoder.masked_softmax_fusion=False \
+ model.encoder.bias_activation_fusion=False \
+ model.encoder.activations_checkpoint_method='block' \
+ model.encoder.activations_checkpoint_num_layers=1 \
+ model.decoder.num_layers=2 \
+ model.decoder.hidden_size=64 \
+ model.decoder.num_attention_heads=8 \
+ model.decoder.activation='swiglu' \
+ model.decoder.masked_softmax_fusion=False \
+ model.decoder.bias_activation_fusion=False \
+ model.decoder.activations_checkpoint_method='block' \
+ model.decoder.activations_checkpoint_num_layers=1 \
+ model.micro_batch_size=2 \
+ model.global_batch_size=4 \
+ model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+ model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
+ model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
+ model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
+ model.train_ds.num_workers=1 \
+ model.validation_ds.num_workers=1 \
+ ~model.test_ds \
+ model.train_ds.dataset_type=text_memmap \
+ model.encoder_tokenizer.library=sentencepiece \
+ model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
+ model.decoder_tokenizer.library=sentencepiece \
+ model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model
+ AFTER_SCRIPT: |
+ rm -rf examples/nlp/machine_translation/megatron_nmt_results

 L2_Megatron_BART_Perceiver_MIM_Training_TP2:
 needs: [cicd-test-container-setup]
- runs-on: self-hosted-azure
- container:
- image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
- options:
- # --user 0:128
- --device=/dev/nvidia0
- --gpus all
- --shm-size=8g
- --env TRANSFORMERS_OFFLINE=0
- --env HYDRA_FULL_ERROR=1
- --volume /mnt/datadrive/TestData:/home/TestData
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
- - run: |
- python examples/nlp/language_modeling/megatron_bart_pretraining.py \
- trainer.devices=2 \
- 
trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string='"800,100,100"' \ - model.data.whole_word_masking=False \ - 
model.tokenizer.library=sentencepiece \
- model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
- ++model.hiddens.enc_output_name=z \
- ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \
- ++model.hiddens.transform.q_z_given_x.hidden_size=64 \
- ++model.hiddens.loss.mim.cls_name=a_mim \
- ++model.hiddens.loss.mim.loss_weight=0.5
- rm -rf examples/nlp/language_modeling/megatron_mim_results
- - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
- if: "failure()"
+ uses: ./.github/workflows/_test_template.yml
+ with:
+ RUNNER: self-hosted-azure
+ SCRIPT: |
+ python examples/nlp/language_modeling/megatron_bart_pretraining.py \
+ trainer.devices=2 \
+ trainer.accelerator=gpu \
+ trainer.log_every_n_steps=1 \
+ trainer.val_check_interval=10 \
+ trainer.limit_val_batches=2 \
+ trainer.accumulate_grad_batches=1 \
+ trainer.max_steps=10 \
+ trainer.precision=16 \
+ trainer.gradient_clip_val=1.0 \
+ exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \
+ model.tensor_model_parallel_size=2 \
+ model.seq_length=128 \
+ model.encoder.num_layers=4 \
+ model.encoder.hidden_size=64 \
+ model.encoder.arch=perceiver \
+ model.encoder.num_attention_heads=8 \
+ model.encoder.activation='swiglu' \
+ model.encoder.masked_softmax_fusion=False \
+ model.encoder.bias_activation_fusion=False \
+ model.encoder.activations_checkpoint_method='block' \
+ model.encoder.activations_checkpoint_num_layers=1 \
+ model.decoder.num_layers=2 \
+ model.decoder.hidden_size=64 \
+ model.decoder.num_attention_heads=8 \
+ model.decoder.activation='swiglu' \
+ model.decoder.masked_softmax_fusion=False \
+ model.decoder.bias_activation_fusion=False \
+ model.decoder.activations_checkpoint_method='block' \
+ model.decoder.activations_checkpoint_num_layers=1 \
+ model.micro_batch_size=2 \
+ model.global_batch_size=4 \
+ model.data.data_impl=text_mmap \
+ model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \
+ model.data.splits_string='"800,100,100"' \
+ model.data.whole_word_masking=False \
+ model.tokenizer.library=sentencepiece \
+ model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
+ ++model.hiddens.enc_output_name=z \
+ ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \
+ ++model.hiddens.transform.q_z_given_x.hidden_size=64 \
+ ++model.hiddens.loss.mim.cls_name=a_mim \
+ ++model.hiddens.loss.mim.loss_weight=0.5
+ # Change val_check_interval to 1 for resume as the len(dataloader) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error
+ # if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run()
+ python examples/nlp/language_modeling/megatron_bart_pretraining.py \
+ trainer.devices=2 \
+ trainer.accelerator=gpu \
+ trainer.log_every_n_steps=1 \
+ trainer.val_check_interval=1 \
+ trainer.limit_val_batches=2 \
+ trainer.accumulate_grad_batches=1 \
+ trainer.max_steps=10 \
+ trainer.precision=16 \
+ trainer.gradient_clip_val=1.0 \
+ exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \
+ model.tensor_model_parallel_size=2 \
+ model.seq_length=128 \
+ model.encoder.num_layers=4 \
+ model.encoder.hidden_size=64 \
+ model.encoder.arch=perceiver \
+ model.encoder.num_attention_heads=8 \
+ model.encoder.activation='swiglu' \
+ model.encoder.masked_softmax_fusion=False \
+ model.encoder.bias_activation_fusion=False \
+ 
model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.data.data_impl=text_mmap \ + model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ + model.data.splits_string='"800,100,100"' \ + model.data.whole_word_masking=False \ + model.tokenizer.library=sentencepiece \ + model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + ++model.hiddens.enc_output_name=z \ + ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ + ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ + ++model.hiddens.loss.mim.cls_name=a_mim \ + ++model.hiddens.loss.mim.loss_weight=0.5 + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/megatron_mim_results # stage('L2: NMT Bottleneck Fallback') { # when { @@ -3431,63 +2581,322 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + 
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + + L2_Megatron_Bert_Pretraining_and_Resume_Training: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ 
+ model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + + L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=32 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=32 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + + L2_Megatron_RETRO_Pretraining_and_Resume_Training: + needs: 
[cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=10 + + python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=20 + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/mcore_retro_results + + L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ + trainer.devices=2 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.accumulate_grad_batches=1 \ + trainer.limit_val_batches=2 \ + exp_manager.resume_if_exists=True \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + trainer.val_check_interval=10 \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ + model.data.data_prefix= \ + model.data.knn_index= \ + model.data.retrieval_prefix= \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.chunk_size=32 \ + model.enc_num_layers=2 \ + model.dec_num_layers=2 \ + model.enc_cross_attention=[1] \ + model.dec_cross_attention=[1] \ + +model.data.mock=True + + python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ trainer.devices=2 \ + trainer.num_nodes=1 \ trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - 
trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ + exp_manager.resume_if_exists=True \ trainer.max_steps=20 \ trainer.precision=16 \ trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ + trainer.val_check_interval=10 \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ + model.data.data_prefix= \ + model.data.knn_index= \ + model.data.retrieval_prefix= \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ model.optim.name=fused_adam \ model.optim.lr=2e-4 \ model.optim.sched.warmup_steps=2 \ @@ -3495,24 +2904,113 @@ jobs: model.optim.sched.min_lr=8e-5 \ model.max_position_embeddings=128 \ model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_Bert_Pretraining_and_Resume_Training: + model.chunk_size=32 \ + model.enc_num_layers=2 \ + model.dec_num_layers=2 \ + model.enc_cross_attention=[1] \ + model.dec_cross_attention=[1] \ + +model.data.mock=True + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/retro_legacy_results + + # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # --volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - run: | + # python 
examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ + # trainer.devices=2 \ + # trainer.num_nodes=1 \ + # trainer.accelerator=gpu \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=100 \ + # trainer.log_every_n_steps=1 \ + # trainer.precision=16 \ + # trainer.val_check_interval=100 \ + # trainer.limit_val_batches=0 \ + # trainer.gradient_clip_val=1.0 \ + # +trainer.num_sanity_val_steps=0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ + # +exp_manager.version=smalltest \ + # model.data.neighbors=2 \ + # model.megatron_amp_O2=False \ + # model.apply_query_key_layer_scaling=False \ + # model.tensor_model_parallel_size=1 \ + # model.optim.name=muadamw \ + # model.optim.weight_decay=0.1 \ + # model.optim.betas=[0.9,0.95] \ + # model.optim.lr=6e-4 \ + # model.optim.sched.warmup_steps=1000 \ + # model.optim.sched.constant_steps=0 \ + # model.optim.sched.min_lr=6e-5 \ + # model.add_position_embedding=False \ + # model.enc_num_layers=2 \ + # model.dec_num_layers=6 \ + # model.enc_cross_attention=[0] \ + # model.dec_cross_attention=[3,5] \ + # model.hidden_size=96 \ + # model.ffn_hidden_size=384 \ + # model.init_method_std=0.023 \ + # model.num_attention_heads=12 \ + # model.max_position_embeddings=1024 \ + # model.encoder_seq_length=1024 \ + # model.tokenizer.library=megatron \ + # model.tokenizer.type=GPT2BPETokenizer \ + # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ + # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ + # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ + # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ + # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ + # model.data.num_workers=8 \ + # model.micro_batch_size=8 \ + # model.normalization=rmsnorm \ + # model.transformer_block_type=pre_ln \ + # model.bias_activation_fusion=True \ + # model.bias_dropout_add_fusion=False \ + # model.masked_softmax_fusion=True \ + # model.hidden_dropout=0 \ + # model.attention_dropout=0 \ + # model.fp32_residual_connection=True \ + # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml + + # python -c "import pandas as pd + # import pathlib + # from pandas.testing import assert_frame_equal + # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator + # import torch + # if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): + # import sys + # sys.exit(0) + # event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] + # ea = EventAccumulator(str(event_file)).Reload() + # vals = [] + # for i in ea.Scalars('reduced_train_loss'): + # vals.append(i.value) + # training_curve = pd.DataFrame({'loss': vals}) + # gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') + # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" + + # rm -rf examples/nlp/language_modeling/retro_results + # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # if: "failure()" + + L2_RAG_Pipeline_Indexing: needs: [cicd-test-container-setup] runs-on: self-hosted-azure + timeout-minutes: 10 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -3527,74 +3025,23 @@ jobs: - name: Checkout 
repository uses: actions/checkout@v4 - run: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + python examples/nlp/rag/rag_indexing.py \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + trainer.precision='bf16-mixed' \ + indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ + indexing.embedder.embed_batch_size=128 \ + indexing.data.data_path='/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data' \ + indexing.data.chunk_size=256 \ + indexing.data.chunk_overlap=10 \ + indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: + L2_RAG_Pipeline_Generating: needs: [cicd-test-container-setup] runs-on: self-hosted-azure + timeout-minutes: 10 
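The rag_indexing.py invocation above splits the corpus into chunks of 256 tokens with an overlap of 10 before embedding them. As a rough illustration of what chunk_size and chunk_overlap control, here is a sketch using plain whitespace splitting (the real pipeline uses its own embedder tokenizer and chunker, so this is purely conceptual):

# Sketch only: shows the effect of chunk_size / chunk_overlap as passed to the
# RAG indexing job above; whitespace tokenization is an assumption for brevity.
def chunk_tokens(tokens, chunk_size=256, chunk_overlap=10):
    step = chunk_size - chunk_overlap
    return [tokens[i:i + chunk_size] for i in range(0, max(len(tokens) - chunk_overlap, 1), step)]

text = "some long corpus document " * 100
chunks = chunk_tokens(text.split(), chunk_size=256, chunk_overlap=10)
print(len(chunks), len(chunks[0]))  # consecutive chunks share 10 tokens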
container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -3609,537 +3056,199 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - run: | - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python 
examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - - rm -rf examples/nlp/language_modeling/mcore_retro_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu 
\ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - rm -rf examples/nlp/language_modeling/retro_legacy_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - # trainer.devices=2 \ - # trainer.num_nodes=1 \ - # trainer.accelerator=gpu \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=100 \ - # trainer.log_every_n_steps=1 \ - # trainer.precision=16 \ - # trainer.val_check_interval=100 \ - # trainer.limit_val_batches=0 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.num_sanity_val_steps=0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - # +exp_manager.version=smalltest \ - # model.data.neighbors=2 \ - # model.megatron_amp_O2=False \ - # model.apply_query_key_layer_scaling=False \ - # model.tensor_model_parallel_size=1 \ - # model.optim.name=muadamw \ - # model.optim.weight_decay=0.1 \ - # model.optim.betas=[0.9,0.95] \ - # model.optim.lr=6e-4 \ - # model.optim.sched.warmup_steps=1000 \ - # model.optim.sched.constant_steps=0 \ - # model.optim.sched.min_lr=6e-5 \ - # model.add_position_embedding=False \ - # model.enc_num_layers=2 \ - # model.dec_num_layers=6 \ - # model.enc_cross_attention=[0] \ - # model.dec_cross_attention=[3,5] \ - # model.hidden_size=96 \ - # model.ffn_hidden_size=384 \ - # model.init_method_std=0.023 \ - # model.num_attention_heads=12 \ - # model.max_position_embeddings=1024 \ - # model.encoder_seq_length=1024 \ - # model.tokenizer.library=megatron \ - # model.tokenizer.type=GPT2BPETokenizer \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - # model.data.num_workers=8 \ - # model.micro_batch_size=8 \ - # model.normalization=rmsnorm \ - # 
model.transformer_block_type=pre_ln \ - # model.bias_activation_fusion=True \ - # model.bias_dropout_add_fusion=False \ - # model.masked_softmax_fusion=True \ - # model.hidden_dropout=0 \ - # model.attention_dropout=0 \ - # model.fp32_residual_connection=True \ - # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml - - # python -c "import pandas as pd - # import pathlib - # from pandas.testing import assert_frame_equal - # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - # import torch - # if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): - # import sys - # sys.exit(0) - # event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] - # ea = EventAccumulator(str(event_file)).Reload() - # vals = [] - # for i in ea.Scalars('reduced_train_loss'): - # vals.append(i.value) - # training_curve = pd.DataFrame({'loss': vals}) - # gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') - # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" - - # rm -rf examples/nlp/language_modeling/retro_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - L2_BioMegatron_Bert_NER_Task: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - rm -rf examples/nlp/language_modeling/token_classification_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - 
model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - 
model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + python examples/nlp/rag/rag_generating.py \ + trainer.devices=1 \ + trainer.precision='bf16-mixed' \ + indexing.embedder.model_path='/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo' \ + indexing.index_path='/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index' \ + generating.llm.model_path='/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo' \ + generating.inference.tokens_to_generate=50 \ + generating.inference.greedy=False \ + generating.inference.temperature=1.0 \ + generating.query='Which art schools did I applied to?' 
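The generating step above sets generating.inference.greedy=False with temperature=1.0, i.e. it samples from the model's output distribution instead of always taking the arg-max token. A minimal sketch of the difference between the two modes (generic softmax sampling, not NeMo's inference code):

# Sketch only: contrasts greedy decoding with temperature sampling, the two
# behaviours toggled by generating.inference.greedy / temperature above.
import math, random

def next_token(logits, greedy=False, temperature=1.0):
    if greedy:
        return max(range(len(logits)), key=lambda i: logits[i])
    scaled = [l / temperature for l in logits]
    m = max(scaled)
    probs = [math.exp(l - m) for l in scaled]
    total = sum(probs)
    probs = [p / total for p in probs]
    return random.choices(range(len(logits)), weights=probs, k=1)[0]

logits = [2.0, 1.0, 0.1]
print(next_token(logits, greedy=True))        # always index 0
print(next_token(logits, temperature=1.0))    # usually 0, sometimes 1 or 2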
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + L2_BioMegatron_Bert_NER_Task: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/token_classification/token_classification_train.py \ + exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ + trainer.max_epochs=1 \ + model.dataset.data_dir=/home/TestData/nlp/ner \ + model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ + model.tokenizer.tokenizer_name=null + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/token_classification_results + + L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + 
model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + + L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + # trainer.devices=2 \ + # trainer.accelerator=gpu \ + # trainer.log_every_n_steps=1 \ + # trainer.val_check_interval=2 \ + # trainer.limit_val_batches=1 \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=6 \ + # trainer.precision=16 \ + # trainer.gradient_clip_val=1.0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + # exp_manager.resume_if_exists=True \ + # model.tensor_model_parallel_size=2 \ + # model.optim.name=fused_adam \ + # model.optim.lr=2e-4 \ + # model.optim.sched.warmup_steps=2 \ + # model.optim.sched.constant_steps=2 \ + # model.optim.sched.min_lr=8e-5 \ + # model.max_position_embeddings=128 \ + # model.encoder_seq_length=128 \ + # model.data.seq_length=128 \ + # model.position_embedding_type=rope \ + # model.rotary_percentage=0.5 \ + # model.normalization=rmsnorm \ + # model.bias=False \ + # model.bias_activation_fusion=False \ + # model.bias_dropout_add_fusion=False \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # 
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # model.num_layers=8 \ + # model.hidden_size=256 \ + # model.num_attention_heads=8 \ + # model.activations_checkpoint_method=block \ + # model.activations_checkpoint_granularity=full \ + # model.activations_checkpoint_num_layers=1 \ + # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + # This test requires Ampere but some of the test GPUs are Volta # Need to add a check for compute capability before uncommenting this test # - name: L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2 @@ -4209,1910 +3318,1551 @@ jobs: # # model.optim.sched.min_lr=8e-5 \ # # model.max_position_embeddings=128 \ # # model.encoder_seq_length=128 \ - # # model.data.seq_length=128 \ - # # model.position_embedding_type=rope \ - # # model.rotary_percentage=0.5 \ - # # model.normalization=rmsnorm \ - # # model.bias=False \ - # # model.bias_activation_fusion=False \ - # # model.bias_dropout_add_fusion=False \ - # # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # # model.num_layers=8 \ - # # model.hidden_size=256 \ - # # model.num_attention_heads=8 \ - # # model.activations_checkpoint_method=block \ - # # model.activations_checkpoint_granularity=full \ - # # model.activations_checkpoint_num_layers=1 \ - # # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - # # model.use_flash_attention=True" - # rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - # rm -rf examples/nlp/language_modeling/gpt_index_mappings" - # } - # } - - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - 
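The comment above notes that the flash-attention test needs Ampere GPUs while some CI runners are Volta, and that a compute-capability check is still missing. One way such a guard could look, sketched with plain PyTorch in the spirit of the A100 check used elsewhere in this file (the workflow itself does not implement this yet):

# Sketch only: skip logic for the compute-capability guard suggested above.
# Ampere corresponds to CUDA compute capability 8.x.
import sys
import torch

if not torch.cuda.is_available():
    sys.exit(0)  # nothing to test without a GPU

major, _minor = torch.cuda.get_device_capability(0)
if major < 8:
    print("Skipping: flash-attention test requires Ampere (sm_80) or newer")
    sys.exit(0)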
model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - 
model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python 
examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - #@athitten Remove /home/TestData/nlp/megatron_sft/trec.jsonl for validation and 
test file until we have support for multiple dataloaders in lightning 2.0 - L2_Megatron_GPT_Finetuning_PP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - rm -rf examples/nlp/language_modeling/gpt_sft_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_Finetuning_StarCoder_PP1: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - 
container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - - rm -rf examples/nlp/language_modeling/gpt_sft_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_PEFT_Lora_PP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_GPT_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume 
/mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf /home/TestData/nlp/lora_tuning_tp2 - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' - - rm -rf /home/TestData/nlp/lora_tuning_tp2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + # # model.data.seq_length=128 \ + # # model.position_embedding_type=rope \ + # # model.rotary_percentage=0.5 \ + # # model.normalization=rmsnorm \ + # # model.bias=False \ + # # model.bias_activation_fusion=False \ + # # model.bias_dropout_add_fusion=False \ + # # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # # model.num_layers=8 \ + # # model.hidden_size=256 \ + # # model.num_attention_heads=8 \ + # # model.activations_checkpoint_method=block \ + # # model.activations_checkpoint_granularity=full \ + # # model.activations_checkpoint_num_layers=1 \ + # # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + # # model.use_flash_attention=True" + # rm -rf examples/nlp/language_modeling/gpt_pretrain_results" + # rm -rf examples/nlp/language_modeling/gpt_index_mappings" + # } + # } - L2_Megatron_GPT_Eval: + L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - 
--device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? A:'] \ - tensor_model_parallel_size=1 \ - inference.tokens_to_generate=32 \ - trainer.precision=32 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # not testing resume functionality to save time on ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=alibi \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + 
#model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings - L2_Megatron_GPT_Eval_PP2: + L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - server=False \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=32 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + 
#exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=kerple \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: + L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ - model.peft.restore_from_path=null \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \ - model.data.test_ds.names=[test] \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=30 \ - model.data.test_ds.max_seq_length=6000 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=examples/nlp/language_modeling/out.jsonl && \ - rm -rf examples/nlp/language_modeling/out.jsonl - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.mcore_gpt=True \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + 
model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.activation=fast-swiglu \ + model.bias_activation_fusion=False \ + model.hidden_dropout=0.0 \ + model.attention_dropout=0.0 \ + model.transformer_block_type=normformer \ + model.headscale=True \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + model.mcore_gpt=True \ + model.megatron_amp_O2=True \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.activation=fast-swiglu \ + model.bias_activation_fusion=False \ + model.hidden_dropout=0.0 \ + model.attention_dropout=0.0 \ + model.transformer_block_type=normformer \ + model.headscale=True \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings - # TODO: Add this test back. 
Test was failing on CI machines due to HW error - # - name: L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval - # when { - # anyOf { - # branch main - # changeRequest target: main - # } - # } - # failFast true - # - run: | - # python -m torch.distributed.launch --nproc_per_node=2 \ - # examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - # --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ - # --checkpoint_name=model_optim_rng.pt \ - # --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ - # --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ - # --model_type=gpt \ - # --pipeline_model_parallel_size=1 \ - # --gpus_per_node=2 \ - # --tensor_model_parallel_size=2" - # python examples/nlp/language_modeling/megatron_gpt_eval.py \ - # --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ - # --tokens_to_generate=32 \ - # --tensor_model_parallel_size=2 \ - # --prompt=This is a test. - # rm examples/nlp/language_modeling/small_gpt.nemo - - # L2_Megatron_Change_Partitions - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: + L2_Megatron_GPT_Finetuning_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 1 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 2 - - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + +trainer.limit_val_batches=2 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.peft.peft_scheme=null \ + model.data.train_ds.micro_batch_size=1 \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.names=[quarel] \ + model.data.validation_ds.micro_batch_size=1 \ + model.data.validation_ds.global_batch_size=1 \ + model.data.validation_ds.num_workers=0 \ + 
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + +trainer.limit_val_batches=2 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.peft.peft_scheme=null \ + model.data.train_ds.micro_batch_size=1 \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.names=[quarel] \ + model.data.validation_ds.micro_batch_size=1 \ + model.data.validation_ds.global_batch_size=1 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_sft_results - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: + L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 4 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 1 - - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.precision=32 \ + trainer.max_steps=4 \ + trainer.val_check_interval=4 \ + trainer.enable_checkpointing=False \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + exp_manager.checkpoint_callback_params.save_best_model=False \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.peft.peft_scheme=none \ + model.optim.name=distributed_fused_adam \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.num_workers=0 \ + model.data.train_ds.concat_sampling_probabilities=[1.0] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_sft_results + + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - 
model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir + + python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ + exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.test_ds.write_embeddings_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ + model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: + L2_Megatron_GPT_PEFT_Lora_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - 
--device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ + model.pipeline_model_parallel_size=2 \ + model.tensor_model_parallel_size=1 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + model.peft.peft_scheme=lora \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: + L2_Megatron_GPT_PEFT_Lora_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 
\ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/lora_tuning_tp2 + + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ + model.pipeline_model_parallel_size=1 \ + model.tensor_model_parallel_size=2 \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.peft.peft_scheme='lora' \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ + model.tensor_model_parallel_size=2 \ + trainer.devices=2 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=['quarel4'] \ + model.global_batch_size=2 \ + model.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl' + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/lora_tuning_tp2 - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: + L2_Megatron_GPT_Eval: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - 
model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_eval.py \ + gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ + prompts=['How to fix GPU memory? A:'] \ + tensor_model_parallel_size=1 \ + inference.tokens_to_generate=32 \ + trainer.precision=32 - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: + L2_Megatron_GPT_Eval_PP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_eval.py \ + gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ + server=False \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=2 \ trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.num_moe_experts=4 \ - model.decoder.num_moe_experts=4 \ - model.encoder.moe_frequency=3 \ - model.decoder.moe_frequency=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + trainer.num_nodes=1 \ + trainer.precision=32 - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: + L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=normformer \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type=normformer \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=normformer \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - 
model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type=normformer \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ + model.peft.restore_from_path=null \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \ + model.data.test_ds.names=[test] \ + model.data.test_ds.global_batch_size=1 \ + model.data.test_ds.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=30 \ + model.data.test_ds.max_seq_length=6000 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path=examples/nlp/language_modeling/out.jsonl + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/out.jsonl - L2_Megatron_T5_Eval: + # TODO: Add this test back. Test was failing on CI machines due to HW error + # - name: L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval + # when { + # anyOf { + # branch main + # changeRequest target: main + # } + # } + # failFast true + # - run: | + # python -m torch.distributed.launch --nproc_per_node=2 \ + # examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ + # --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ + # --checkpoint_name=model_optim_rng.pt \ + # --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ + # --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ + # --model_type=gpt \ + # --pipeline_model_parallel_size=1 \ + # --gpus_per_node=2 \ + # --tensor_model_parallel_size=2" + # python examples/nlp/language_modeling/megatron_gpt_eval.py \ + # --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ + # --tokens_to_generate=32 \ + # --tensor_model_parallel_size=2 \ + # --prompt=This is a test. + # rm examples/nlp/language_modeling/small_gpt.nemo + + # L2_Megatron_Change_Partitions + L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' 
\ - --tensor_model_parallel_size 1 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ + --tensor_model_parallel_size 2 \ + --target_tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 1 \ + --target_pipeline_model_parallel_size 2 + AFTER_SCRIPT: | + rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: + L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - 
model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' - - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ + --tensor_model_parallel_size 2 \ + --target_tensor_model_parallel_size 4 \ + --pipeline_model_parallel_size 1 \ + --target_pipeline_model_parallel_size 1 + AFTER_SCRIPT: | + rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: + L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - 
model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=relative \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=fast-swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + 
model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=relative \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=fast-swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings - # L2: Megatron T5 GLUE/XNLI Finetuning - # TODO(Oktai15): update it in 1.8.0 version - L2_Megatron_T5_GLUE_RTE: + L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv - - rm -rf examples/nlp/language_modeling/t5_glue_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_T5_GLUE_XNLI: + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + 
trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=alibi \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=alibi \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + 
model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + + L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=kerple \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=kerple \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=swiglu \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + 
model.decoder.transformer_block_type=pre_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + + L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.decoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.decoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_attention_heads=8 \ + model.decoder.ffn_hidden_size=2048 \ + model.encoder.activation=gelu \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=post_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.decoder.num_layers=1 \ + model.encoder.hidden_size=64 \ + model.decoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_attention_heads=8 \ + model.decoder.ffn_hidden_size=2048 \ + model.encoder.activation=gelu \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=post_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + AFTER_SCRIPT: | + rm -rf 
examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + + L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv - - rm -rf examples/nlp/language_modeling/t5_xnli_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_T5_PEFT_Lora_TP2: + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.decoder.num_layers=1 \ + model.encoder.num_moe_experts=4 \ + model.decoder.num_moe_experts=4 \ + model.encoder.moe_frequency=3 \ + model.decoder.moe_frequency=1 \ + model.encoder.hidden_size=64 \ + model.decoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_attention_heads=8 \ + model.decoder.ffn_hidden_size=2048 \ + model.encoder.activation=gelu \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=pre_ln \ + model.decoder.transformer_block_type=post_ln \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + AFTER_SCRIPT: | + rm -rf 
examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + + L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=normformer \ + model.encoder.headscale=True \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.decoder.transformer_block_type=normformer \ + model.decoder.headscale=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=swiglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type=normformer \ + model.encoder.headscale=True \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.decoder.transformer_block_type=normformer \ + model.decoder.headscale=False \ + 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings - python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/home/TestData/nlp/t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl + L2_Megatron_T5_Eval: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_eval.py \ + --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ + --prompt 'How do I fix my GPU memory issue? I am seeing out of memory.' 
\ + --tensor_model_parallel_size 1 - rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='reglu' \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='reglu' \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=5 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='reglu' \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='reglu' \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}' + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bart_pretrain_results + + L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + 
trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=geglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.respect_document_boundaries=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] + + python examples/nlp/language_modeling/megatron_bart_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.pipeline_model_parallel_size=2 \ + model.pipeline_model_parallel_split_rank=1 \ + model.seq_length=256 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation=geglu \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method=block \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation=geglu \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method=block \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.data.respect_document_boundaries=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/bart_pretrain_results + + # L2: Megatron T5 GLUE/XNLI Finetuning + # TODO(Oktai15): update it in 1.8.0 version + L2_Megatron_T5_GLUE_RTE: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ + trainer.devices=1 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ + model.pipeline_model_parallel_size=1 \ + 
model.pipeline_model_parallel_split_rank=0 \ + model.data.train_ds.task_name=rte \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.micro_batch_size=2 \ + model.data.validation_ds.global_batch_size=2 \ + model.data.validation_ds.micro_batch_size=2 \ + model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ + model.data.validation_ds.task_name=rte \ + model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_glue_results + + L2_Megatron_T5_GLUE_XNLI: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ + -cn megatron_t5_config_finetune_glue_xnli \ + trainer.devices=1 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=1 \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ + model.pipeline_model_parallel_size=1 \ + model.pipeline_model_parallel_split_rank=0 \ + model.data.train_ds.global_batch_size=4 \ + model.data.train_ds.micro_batch_size=2 \ + model.data.validation_ds.global_batch_size=2 \ + model.data.validation_ds.micro_batch_size=2 \ + model.data.test_ds.global_batch_size=2 \ + model.data.test_ds.micro_batch_size=2 \ + model.data.train_ds.task_name=rte \ + model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ + model.data.validation_ds.task_name=xnli \ + model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ + model.data.test_ds.task_name=xnli \ + model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_xnli_results + + L2_Megatron_T5_PEFT_Lora_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 + + python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=9999 \ + trainer.max_steps=3 \ + trainer.val_check_interval=3 \ + ++trainer.limit_val_batches=2 \ + trainer.precision=16 \ + exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \ + model.pipeline_model_parallel_size=1 \ + model.tensor_model_parallel_size=2 \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ + model.peft.peft_scheme=lora \ + model.answer_only_loss=True \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.concat_sampling_probabilities=[1.0] \ + model.data.train_ds.num_workers=0 \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.names=[quarel] + + python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ + model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ + 
model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ + model.peft.restore_from_ckpt_name=null \ + model.peft.restore_from_hparams_path=null \ + model.tensor_model_parallel_size=2 \ + trainer.devices=2 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ + model.data.test_ds.names=[quarel4] \ + model.global_batch_size=2 \ + model.micro_batch_size=1 \ + model.data.test_ds.tokens_to_generate=10 \ + model.data.test_ds.write_predictions_to_file=True \ + model.data.test_ds.output_file_path_prefix=/home/TestData/nlp/t5_lora_tuning_tp2/out \ + inference.greedy=True \ + inference.repetition_penalty=1.0 \ + inference.outfile_path=/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl + AFTER_SCRIPT: | + rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 # L2: Megatron Mock Data Generation L2_Megatron_Mock_Data_Generation_MockGPTDataset: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=7 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_Megatron_Mock_Data_Generation_MockT5Dataset: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.max_steps=10 \ - trainer.limit_val_batches=3 \ + trainer.limit_val_batches=7 \ trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.mcore_gpt=True \ model.data.data_impl=mock \ model.data.data_prefix=[] - rm -rf examples/nlp/language_modeling/t5_pretrain_results - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + L2_Megatron_Mock_Data_Generation_MockT5Dataset: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.max_steps=10 \ + trainer.limit_val_batches=3 \ + trainer.val_check_interval=10 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.data.data_impl=mock \ + model.data.data_prefix=[] + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results # L2: TTS Fast dev runs 1 L2_TTS_Fast_dev_runs_1_Tacotron_2: needs: 
[cicd-test-container-setup] - runs-on: self-hosted-azure-gpus-1 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/tacotron2.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.decoder.decoder_rnn_dim=256 \ - model.decoder.attention_rnn_dim=1024 \ - model.decoder.prenet_dim=128 \ - model.postnet.postnet_n_convolutions=3 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs \ - ~trainer.check_val_every_n_epoch - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure-gpus-1 + SCRIPT: | + python examples/tts/tacotron2.py \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices=1 \ + trainer.accelerator="gpu" \ + +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.decoder.decoder_rnn_dim=256 \ + model.decoder.attention_rnn_dim=1024 \ + model.decoder.prenet_dim=128 \ + model.postnet.postnet_n_convolutions=3 \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs \ + ~trainer.check_val_every_n_epoch L2_TTS_Fast_dev_runs_1_WaveGlow: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/waveglow.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.waveglow.n_flows=4 \ - model.waveglow.n_wn_layers=2 \ - model.waveglow.n_wn_channels=32 \ - ~trainer.check_val_every_n_epoch - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/tts/waveglow.py \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + 
validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices="[0]" \ + +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + model.waveglow.n_flows=4 \ + model.waveglow.n_wn_layers=2 \ + model.waveglow.n_wn_channels=32 \ + ~trainer.check_val_every_n_epoch L2_TTS_Fast_dev_runs_1_FastPitch: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/fastpitch.py \ - --config-name fastpitch_align_v1.05 \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.symbols_embedding_dim=64 \ - model.input_fft.d_inner=384 \ - model.input_fft.n_layer=2 \ - model.output_fft.d_inner=384 \ - model.output_fft.n_layer=2 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - - L2_TTS_Fast_dev_runs_1_RADTTS: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/radtts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - export_dir=/home/TestData/radtts_test \ - model.optim.lr=0.0001 \ - model.modelConfig.decoder_use_partial_padding=True \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/tts/fastpitch.py \ 
+ --config-name fastpitch_align_v1.05 \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + sup_data_path=/home/TestData/an4_dataset/beta_priors \ + trainer.devices="[0]" \ + +trainer.limit_train_batches=1 \ + +trainer.limit_val_batches=1 \ + trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.pitch_mean=212.35873413085938 \ + model.pitch_std=68.52806091308594 \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + model.symbols_embedding_dim=64 \ + model.input_fft.d_inner=384 \ + model.input_fft.n_layer=2 \ + model.output_fft.d_inner=384 \ + model.output_fft.n_layer=2 \ + ~trainer.check_val_every_n_epoch \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs + + # OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS: + # needs: [cicd-test-container-setup] + # runs-on: self-hosted-azure + # timeout-minutes: 10 + # container: + # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + # options: + # # --user 0:128 + # --device=/dev/nvidia0 + # --gpus all + # --shm-size=8g + # --env TRANSFORMERS_OFFLINE=0 + # --env HYDRA_FULL_ERROR=1 + # --volume /mnt/datadrive/TestData:/home/TestData + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - run: | + # python examples/tts/radtts.py \ + # train_dataset=/home/TestData/an4_dataset/an4_train.json \ + # validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + # sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ + # trainer.devices="[0]" \ + # +trainer.limit_train_batches=1 \ + # +trainer.limit_val_batches=1 \ + # trainer.max_epochs=1 \ + # trainer.strategy=auto \ + # model.pitch_mean=212.35873413085938 \ + # model.pitch_std=68.52806091308594 \ + # model.train_ds.dataloader_params.batch_size=4 \ + # model.train_ds.dataloader_params.num_workers=0 \ + # model.validation_ds.dataloader_params.batch_size=4 \ + # model.validation_ds.dataloader_params.num_workers=0 \ + # export_dir=/home/TestData/radtts_test \ + # model.optim.lr=0.0001 \ + # model.modelConfig.decoder_use_partial_padding=True \ + # ~trainer.check_val_every_n_epoch \ + # ~model.text_normalizer \ + # ~model.text_normalizer_call_kwargs + # #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + # # if: "failure()" L2_TTS_Fast_dev_runs_1_Mixer-TTS: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/mixer_tts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/sup_data \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - 
~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/tts/mixer_tts.py \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + sup_data_path=/home/TestData/an4_dataset/sup_data \ + trainer.devices="[0]" \ + +trainer.limit_train_batches=1 \ + +trainer.limit_val_batches=1 \ + trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.pitch_mean=212.35873413085938 \ + model.pitch_std=68.52806091308594 \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + ~trainer.check_val_every_n_epoch \ + ~model.text_normalizer \ + ~model.text_normalizer_call_kwargs L2_TTS_Fast_dev_runs_1_Hifigan: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/tts/hifigan.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - +trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.generator.upsample_initial_channel=64 \ - +model.debug=true \ - ~trainer.check_val_every_n_epoch - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + python examples/tts/hifigan.py \ + train_dataset=/home/TestData/an4_dataset/an4_train.json \ + validation_datasets=/home/TestData/an4_dataset/an4_val.json \ + trainer.devices="[0]" \ + +trainer.limit_train_batches=1 \ + +trainer.limit_val_batches=1 \ + +trainer.max_epochs=1 \ + trainer.strategy=auto \ + model.train_ds.dataloader_params.batch_size=4 \ + model.train_ds.dataloader_params.num_workers=0 \ + model.validation_ds.dataloader_params.batch_size=4 \ + model.validation_ds.dataloader_params.num_workers=0 \ + model.generator.upsample_initial_channel=64 \ + +model.debug=true \ + ~trainer.check_val_every_n_epoch # L2: NeRF # L2_NeRF_DreamFusion: @@ -6145,44 +4895,32 @@ jobs: Speech_Checkpoints_tests: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ - pretrained_name=QuartzNet15x5Base-En \ - 
dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \
-          batch_size=64 \
-          tolerance=0.1012
-          rm -f examples/asr/evaluation_transcripts.json
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: "failure()"
-
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      TIMEOUT: 20
+      SCRIPT: |
+        CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \
+            pretrained_name=QuartzNet15x5Base-En \
+            dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \
+            batch_size=64 \
+            tolerance=0.1012
+      AFTER_SCRIPT: |
+        rm -f examples/asr/evaluation_transcripts.json

  Nemo_CICD_Test:
-    needs:
-      - L0_Unit_Tests_GPU
+    needs:
+      #- OPTIONAL_L0_Unit_Tests_GPU
      - L0_Unit_Tests_CPU
      - L2_Community_LLM_Checkpoints_tests_Llama
      - L2_Community_LLM_Checkpoints_tests_StarCoder
      - L2_Community_LLM_Checkpoints_tests_Falcon
-      - L2_Community_LLM_Checkpoints_tests_Baichuan2
+      #- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2
      - ASR_dev_run_Speech_to_Text
      - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet
      - ASR_dev_run_Speech_Pre-training_-_CitriNet
      - ASR_dev_run_Speech_To_Text_Finetuning
-      - ASR_dev_run_Speech_To_Text_HF_Finetuning
+      #- OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning
      - ASR_dev_run_Speech_to_Text_WPE_-_Conformer
      - ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
      - L2_Speech_to_Text_EMA
@@ -6248,6 +4986,8 @@ jobs:
      - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training
      - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training
      - L2_Megatron_RETRO_Pretraining_and_Resume_Training
+      - L2_RAG_Pipeline_Indexing
+      - L2_RAG_Pipeline_Generating
      - L2_BioMegatron_Bert_NER_Task
      - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
      - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
@@ -6256,6 +4996,7 @@ jobs:
      - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2
      - L2_Megatron_GPT_Finetuning_PP2
      - L2_Megatron_GPT_Finetuning_StarCoder_PP1
+      - L2_Megatron_GPT_Embedding
      - L2_Megatron_GPT_PEFT_Lora_PP2
      - L2_Megatron_GPT_PEFT_Lora_TP2
      - L2_Megatron_GPT_Eval
@@ -6280,13 +5021,41 @@ jobs:
      - L2_TTS_Fast_dev_runs_1_Tacotron_2
      - L2_TTS_Fast_dev_runs_1_WaveGlow
      - L2_TTS_Fast_dev_runs_1_FastPitch
-      - L2_TTS_Fast_dev_runs_1_RADTTS
+      #- OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS
      - L2_TTS_Fast_dev_runs_1_Mixer-TTS
      - L2_TTS_Fast_dev_runs_1_Hifigan
      - Speech_Checkpoints_tests
-
+    if: always()
    runs-on: ubuntu-latest
-    steps:
-      # This should depend on all the tests so we block/unblock based on all tests passing
-      - run: exit 0
+    steps:
+      - if: ${{ always() }}
+        id: pipeline-conclusion
+        run: |
+          # Slack notifications are sent only on test failure (not cancelled):
+          FAILED=${{ contains(needs.*.outputs.conclusion, 'failure') }}
+          echo "FAILED=$FAILED" >> $GITHUB_OUTPUT
+
+          # Mark as successful if no job was cancelled:
+          SUCCESS=${{ !contains(needs.*.result, 'cancelled') }}
+          echo "SUCCESS=$SUCCESS" >> $GITHUB_OUTPUT
+
+      # This should depend on all the tests so we block/unblock based on all tests passing
+      - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }}
+        run: exit 0
+
+      - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
+        name: Checkout repository
+        uses: actions/checkout@v4
+
+      - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
+        run: |
+          source .github/scripts/slackHelper.sh
+
+          WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK }}
+          PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

+          sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL"
+      - if:
${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} + run: | + exit 1 diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml new file mode 100644 index 000000000000..a4b8cf3d4072 --- /dev/null +++ b/.github/workflows/code-formatting.yml @@ -0,0 +1,66 @@ +name: Isort and Black Formatting +# Incrementally reformat only changed files with black, all files with isort +# +# Replaces pre-commit.ci, since it reformats all the files. +# See issue https://github.com/pre-commit-ci/issues/issues/90 +# +# The action requires a custom token to trigger workflow after pushing reformatted files back to the branch. +# `secrets.GITHUB_TOKEN` can be used instead, but this will result +# in not running necessary checks after reformatting, which is undesirable. +# For details see https://github.com/orgs/community/discussions/25702 + +on: + pull_request_target: + paths: + - '**.py' + +jobs: + reformat_with_isort_and_black: + runs-on: ubuntu-latest + permissions: + # write permissions required to commit changes + contents: write + steps: + - name: Checkout branch + uses: actions/checkout@v4 + with: + # setup repository and ref for PRs, see + # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs + repository: ${{ github.event.pull_request.head.repo.full_name }} + ref: ${{ github.event.pull_request.head.ref }} + # custom token is required to trigger actions after reformatting + pushing + token: ${{ secrets.NEMO_REFORMAT_TOKEN }} + + # https://github.com/tj-actions/changed-files + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v44 + with: + files: | + **.py + + - name: Setup Python env + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: black + uses: psf/black@stable + with: + options: "--verbose" + # apply only to changed files (pass explicitly the files) + src: "${{ steps.changed-files.outputs.all_changed_files }}" + version: "~= 24.3" + + - name: isort + uses: isort/isort-action@v1 + with: + isort-version: "5.13.2" + # reformat all files with isort – safe since the whole repo is already reformatted + configuration: "" + + - uses: EndBug/add-and-commit@v9 + # Commit changes. Nothing is committed if no changes. 
+ with: + message: Apply isort and black reformatting + commit: --signoff diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 75d1a6c51a1e..3f2213062872 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,6 +19,8 @@ ci: autofix_prs: true autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' autoupdate_schedule: quarterly + # skip all hooks that can change the files, use GitHub Action "code-formatting.yml" for this + skip: [black,isort] repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -32,15 +34,19 @@ repos: - id: requirements-txt-fixer - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort name: Format imports exclude: docs/ - - repo: https://github.com/psf/black - rev: 19.10b0 + # Using this mirror lets us use mypyc-compiled black, which is about 2x faster + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 24.3.0 hooks: - id: black - name: Format code - additional_dependencies: ['click==8.0.2'] + # It is recommended to specify the latest version of Python + # supported by your project here, or alternatively use + # pre-commit's default_language_version, see + # https://pre-commit.com/#top_level-default_language_version + language_version: python3.10 diff --git a/Dockerfile.ci b/Dockerfile.ci new file mode 100644 index 000000000000..18188f7be45f --- /dev/null +++ b/Dockerfile.ci @@ -0,0 +1,75 @@ +# syntax=docker/dockerfile:1-labs + +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3 + +FROM ${BASE_IMAGE} + +ENV TRANSFORMERS_OFFLINE=0 +ENV HYDRA_FULL_ERROR=1 +ENV PYTHONUNBUFFERED=1 + +# APT packages +RUN <<"EOF" bash -ex +apt-get update +apt-get install -y bc libsox-fmt-all -y +apt-get clean +EOF + +WORKDIR /workspace + +# Install NeMo requirements +ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e +ARG MODELOPT_VERSION=0.11.0 +ARG MCORE_TAG=c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9 +ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c +RUN \ +--mount=type=bind,source=requirements,target=requirements \ +--mount=type=bind,source=tools,target=tools \ +--mount=type=bind,source=setup.py,target=setup.py \ +--mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \ +--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex +pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \ +"transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \ +"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \ +"nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \ +"apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \ +"llama-index==0.10.43" \ +-r tools/ctc_segmentation/requirements.txt \ +".[all]" + +# Megatron Core installation +git clone https://github.com/NVIDIA/Megatron-LM.git && \ +pushd Megatron-LM && \ +git checkout ${MCORE_TAG} && \ + pushd megatron/core/datasets && \ + make && \ + popd && \ +popd +export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" +EOF + +# Copy over NeMo code +COPY ./ ./ +RUN <<"EOF" bash -ex +pip install --no-cache-dir --no-build-isolation ".[all]" + +# set permission +chmod 777 -R /workspace +EOF + +ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" + diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index cbc52d20c41c..000000000000 --- a/Jenkinsfile +++ /dev/null @@ -1,5912 +0,0 @@ -pipeline { - agent { - docker { - image 'nvcr.io/nvidia/pytorch:24.02-py3' - args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1' - } - } - - environment { - NVTE_FUSED_ATTN = 0 - NVTE_FLASH_ATTN = 0 - PYTHONPATH = "/mnt/D3/JenkinsWorkDir/workspace/NeMo-multibranch_${GIT_BRANCH}/Megatron-LM" - } - - options { - timeout(time: 8, unit: 'HOURS') - disableConcurrentBuilds(abortPrevious: true) - } - - stages { - - stage('Add git safe directory'){ - steps{ - sh 'git config --global --add safe.directory /var/lib/jenkins/workspace/NeMo_$GIT_BRANCH' - sh 'git config --global --add safe.directory /raid/JenkinsWorkDir/workspace/NeMo_$GIT_BRANCH' - sh 'git config --global --add safe.directory /mnt/D3/JenkinsWorkDir/workspace/NeMo_$GIT_BRANCH' - } - } - - stage('nvidia-smi'){ - steps{ - sh 'nvidia-smi' - } - } - - stage('PyTorch version') { - steps { - sh 'python -c "import torch; print(torch.__version__)"' - sh 'python -c "import torchvision; print(torchvision.__version__)"' - } - } - - stage('Install test requirements') { - steps { - sh 'apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt && pip install -r requirements/requirements_lightning.txt' - } - } - - stage('Code formatting checks') { - steps { - sh 'python setup.py style' - } - } - - stage('Copyright Headers check') { - steps { - sh 'python tests/check_copyright_header.py --dir .' 
- } - } - - stage('NeMo Installation') { - steps { - sh './reinstall.sh release' - } - } - - stage('Transformer Engine installation') { - steps { - sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \ - cd TransformerEngine && \ - git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \ - git checkout FETCH_HEAD && \ - git submodule init && git submodule update && \ - NVTE_FRAMEWORK=pytorch pip install .' - } - } - - stage('Apex installation') { - steps { - sh 'git clone https://github.com/NVIDIA/apex.git && \ - cd apex && \ - git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \ - cp -R apex /usr/local/lib/python3.10/dist-packages' - } - } - - stage('Megatron Core installation') { - steps { - sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ - cd Megatron-LM && \ - git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \ - pip install . && \ - cd megatron/core/datasets && \ - make' - } - } - - stage('AMMO installation') { - steps { - sh 'pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir' - } - } - - stage('PyTorch Lightning version') { - steps { - sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' - } - } - - stage('PyTorch Lightning DDP Checks') { - steps { - sh 'CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"' - } - } - - stage('Basic Import Checks') { - steps { - sh 'python -c "import nemo.collections.asr as nemo_asr"' - sh 'python -c "import nemo.collections.nlp as nemo_nlp"' - sh 'python -c "import nemo.collections.tts as nemo_tts"' - } - } - stage('Import Checks'){ - steps { - sh 'python tests/core_ptl/check_imports.py --domain "nlp"' - } - } - - stage('L0: Unit Tests GPU') { - steps { - sh 'NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads' - } - } - - stage('L0: Unit Tests CPU') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - steps { - sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' - } - } - - stage('L2: Multimodal Imagen Train') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/imagen_train" - sh "python examples/multimodal/text_to_image/imagen/imagen_training.py \ - trainer.precision=16 \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.conditioning.embed_dim=64 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.synthetic_data=True \ - exp_manager.exp_dir=/home/TestData/multimodal/imagen_train \ - model.inductor=False \ - model.unet.flash_attention=False \ - " - sh "rm -rf /home/TestData/multimodal/imagen_train" - } - } - - stage('L2: Multimodal Stable Diffusion Train') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" - sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - trainer.precision=bf16 \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.optim.name=megatron_fused_adam \ - model.data.synthetic_data=True \ - exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train \ - model.inductor=False \ - 
model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ - ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ - ++model.cond_stage_config.max_length=77 \ - ~model.cond_stage_config.restore_from_path \ - ~model.cond_stage_config.freeze \ - ~model.cond_stage_config.layer \ - model.unet_config.from_pretrained=null \ - model.first_stage_config.from_pretrained=null \ - model.unet_config.use_flash_attention=False \ - model.unet_config.attention_resolutions=[1] \ - model.unet_config.channel_mult=[1] \ - model.ddp_overlap=False \ - " - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" - } - } - stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - trainer.precision=bf16 \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ - trainer.max_steps=20 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.synthetic_data=True \ - model.first_stage_key=images_moments \ - model.cond_stage_key=clip_encoded \ - model.optim.name=megatron_fused_adam \ - +model.optim.capturable=True \ - exp_manager.ema.enable=False \ - model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ - ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ - ++model.cond_stage_config.max_length=77 \ - model.inductor=False \ - ~model.cond_stage_config.restore_from_path \ - ~model.cond_stage_config.freeze \ - ~model.cond_stage_config.layer \ - model.first_stage_config.from_pretrained=null \ - model.ddp_overlap=False \ - model.capture_cudagraph_iters=15 \ - model.unet_config.use_flash_attention=False \ - model.unet_config.attention_resolutions=[1] \ - model.unet_config.channel_mult=[1] \ - " - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - } - } -// stage('L2: Multimodal ControlNet Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/controlnet_train" -// sh "python examples/multimodal/text_to_image/controlnet/controlnet_train.py \ -// trainer.precision=16 \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// model.data.synthetic_data=True \ -// exp_manager.exp_dir=/home/TestData/multimodal/controlnet_train \ -// model.inductor=False \ -// model.image_logger.max_images=0 \ -// model.control_stage_config.params.from_pretrained_unet=null \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.unet_config.use_flash_attention=False \ -// " -// sh "rm -rf /home/TestData/multimodal/controlnet_train" -// } -// } -// stage('L2: Multimodal DreamBooth Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/dreambooth_train" -// sh "python examples/multimodal/text_to_image/dreambooth/dreambooth.py \ -// trainer.precision=16 \ 
-// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// exp_manager.exp_dir=/home/TestData/multimodal/dreambooth_train \ -// model.inductor=False \ -// model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ -// ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ -// ++model.cond_stage_config.max_length=77 \ -// ~model.cond_stage_config.restore_from_path \ -// ~model.cond_stage_config.freeze \ -// ~model.cond_stage_config.layer \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.data.instance_dir=/home/TestData/multimodal/tiny-dreambooth \ -// model.unet_config.use_flash_attention=False \ -// " -// sh "rm -rf /home/TestData/multimodal/dreambooth_train" -// } -// } - stage('L2: Vision ViT Pretrain TP=1') { - when { - anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" - sh "python examples/vision/vision_transformer/megatron_vit_classification_pretrain.py \ - trainer.precision=16 \ - model.megatron_amp_O2=False \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - trainer.val_check_interval=5 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.num_workers=0 \ - exp_manager.create_checkpoint_callback=False \ - model.data.data_path=[/home/TestData/multimodal/tiny-imagenet/train,/home/TestData/multimodal/tiny-imagenet/val] \ - exp_manager.exp_dir=/home/TestData/vision/vit_pretrain_tp1 " - sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" - } - } - - stage('L2: Multimodal CLIP Pretrain TP=1') { - when { - anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" - sh "python examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py \ - trainer.precision=16 \ - model.megatron_amp_O2=False \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - trainer.val_check_interval=10 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - exp_manager.create_checkpoint_callback=False \ - model.data.num_workers=0 \ - model.vision.num_layers=2 \ - model.text.num_layers=2 \ - model.vision.patch_dim=32 \ - model.vision.encoder_seq_length=49 \ - model.vision.class_token_length=7 \ - model.data.train.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ - model.data.validation.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ - model.data.webdataset.local_root_path=/ \ - exp_manager.exp_dir=/home/TestData/multimodal/clip_pretrain_tp1 " - sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" - } - } - - stage('L2: Multimodal NeVA Pretrain TP=1') { - when { - anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" - sh "python examples/multimodal/multimodal_llm/neva/neva_pretrain.py \ - trainer.precision=16 \ - model.megatron_amp_O2=False \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - 
trainer.val_check_interval=10 \ - trainer.limit_val_batches=5 \ - trainer.log_every_n_steps=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - trainer.max_steps=20 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - exp_manager.create_checkpoint_callback=False \ - model.data.data_path=/home/TestData/multimodal/tiny-neva/dummy.json \ - model.data.image_folder=/home/TestData/multimodal/tiny-neva/images \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/multimodal/tiny-neva/tokenizer_add_special.model \ - model.num_layers=2 \ - model.hidden_size=5120 \ - model.ffn_hidden_size=13824 \ - model.num_attention_heads=40 \ - model.normalization=rmsnorm \ - model.data.num_workers=0 \ - model.data.conv_template=llama_2 \ - model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \ - model.mm_cfg.llm.from_pretrained=null \ - model.use_flash_attention=false \ - exp_manager.exp_dir=/home/TestData/multimodal/neva_pretrain_tp1 " - sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" - } - } - - stage('Setup test data and models') { - steps { - sh 'python -m tests.setup --save_dir /home/TestData/nlp' - } - } - - // TODO: this requires TE >= v0.11 which is not available in 23.06. - // please uncomment this test once mcore CI is ready. - - - stage('L2: Community LLM Checkpoints tests') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Llama') { - steps { - sh 'CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo' - } - } - stage('StarCoder') { - steps { - sh 'python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ - --output_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf' - sh 'rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo' - } - } - stage('Falcon') { - steps { - sh 'python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ - --output_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo' - sh 'rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo' - } - } - stage('Baichuan2') { - steps { - sh 'python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ - --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo' - sh 'rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo' - } - } - } - } - - stage('L2: Nemo PTQ') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Llama2 - Export Only') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - quantization.algorithm=null \ - model_save=/home/TestData/nlp/megatron_llama/ci_baseline' - sh 'rm -rf /home/TestData/nlp/megatron_llama/ci_baseline' - } - } - stage('Llama2 - INT8 SQ') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/llama_ci_megatron_amp_O2_hf_tokenizer.nemo \ - 
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int8_sq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' - } - } - stage('Llama2 - FP8') { - steps { - sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ - model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - tensor_model_parallel_size=2 \ - trainer.devices=2 \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=fp8 \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - export.inference_tensor_parallel=2 \ - model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' - } - } - } - } - - stage('L2: ASR dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - } - - stage('Speech to Text WPE - CitriNet') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/citrinet/" --config-name="config_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results' - sh 'rm -rf examples/asr/speech_to_text_wpe_results' - } - } - - stage('Speech Pre-training - CitriNet') { - steps { - sh 'python examples/asr/speech_pretraining/speech_pre_training.py \ - --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_pre_training_results' - sh 'rm -rf examples/asr/speech_pre_training_results' - } - } - - stage('Speech To Text Finetuning') { - steps { - sh 'python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results' - sh 'rm -rf examples/asr/speech_finetuning_results' - } - } - - stage('Speech To Text HF Finetuning') { - steps { - sh 'python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" 
--config-name="speech_to_text_hf_finetune" \ - ~model.train_ds.hf_data_cfg \ - model.train_ds.num_workers=1 \ - model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ - model.train_ds.streaming=true \ - +model.train_ds.hf_data_cfg.path="librispeech_asr" \ - +model.train_ds.hf_data_cfg.name=null \ - +model.train_ds.hf_data_cfg.split="test.clean" \ - +model.train_ds.hf_data_cfg.streaming=true \ - ~model.validation_ds.hf_data_cfg \ - model.validation_ds.streaming=true \ - +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ - +model.validation_ds.hf_data_cfg.name=null \ - +model.validation_ds.hf_data_cfg.split="test.clean" \ - +model.validation_ds.hf_data_cfg.streaming=true \ - ~model.test_ds \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - model.optim.sched.warmup_steps=0 \ - +model.optim.sched.max_steps=3 \ - trainer.max_epochs=null \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results' - sh 'rm -rf examples/asr/speech_finetuning_results' - } - } - - // TODO: Please Fix Me - // Error locating target 'nemo.collections.asr.modules.wav2vec_modules.ConvFeatureEncoder', see chained exception above. - // stage('L2: Speech Pre-training - Wav2Vec') { - // steps { - // sh 'python examples/asr/speech_pretraining/speech_pre_training.py \ - // --config-path="../conf/ssl/wav2vec/" --config-name="wav2vec_ci" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_pre_training_results' - // sh 'rm -rf examples/asr/speech_pre_training_results' - // } - // } - - stage('L2: Speech to Text WPE - Conformer') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results' - sh 'rm -rf examples/asr/speech_to_text_wpe_conformer_results' - } - } - } - } - - stage('L2: ASR dev run - part two') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: Speech to Text WPE - Squeezeformer') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.encoder.d_model=144 \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results' - sh 'rm -rf 
examples/asr/speech_to_text_wpe_squeezeformer_results' - } - } - } - } - - stage('L2: Speech to Text EMA') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - +exp_manager.ema.enable=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - - } - - stage('L2: Speech to Text AED') { - when { - anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' - } - } - steps { - sh 'python examples/asr/speech_multitask/speech_to_text_aed.py \ - model.prompt_format=canary \ - model.model_defaults.asr_enc_hidden=256 \ - model.model_defaults.lm_dec_hidden=256 \ - model.encoder.n_layers=12 \ - model.transf_encoder.num_layers=0 \ - model.transf_decoder.config_dict.num_layers=12 \ - model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ - ++model.train_ds.is_tarred=false \ - model.train_ds.batch_duration=60 \ - +model.train_ds.text_field="answer" \ - +model.train_ds.lang_field="target_lang" \ - model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - +model.validation_ds.text_field="answer" \ - +model.validation_ds.lang_field="target_lang" \ - model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - +model.test_ds.text_field="answer" \ - +model.test_ds.lang_field="target_lang" \ - model.tokenizer.langs.spl_tokens.dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ - model.tokenizer.langs.spl_tokens.type="bpe" \ - model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ - model.tokenizer.langs.en.type=bpe \ - ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ - ++model.tokenizer.langs.es.type=bpe \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.use_distributed_sampler=false \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_aed_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - - } - - stage('L2: Speaker dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speaker Recognition') { - steps { - sh 'python examples/speaker_tasks/recognition/speaker_reco.py \ - model.train_ds.batch_size=10 \ - model.validation_ds.batch_size=2 \ - model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ - model.decoder.num_classes=2 \ - trainer.max_epochs=10 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results' - sh 'rm -rf examples/speaker_tasks/recognition/speaker_recognition_results' - } - } - - stage('Speaker Diarization') { - steps { - sh 'python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ - model.diarizer.speaker_embeddings.model_path=titanet_large \ - model.train_ds.batch_size=5 \ - model.validation_ds.batch_size=5 \ - model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - 
model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results' - sh 'rm -rf examples/speaker_tasks/diarization/speaker_diarization_results' - } - } - - stage('Speech to Label') { - steps { - sh 'python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results' - sh 'rm -rf examples/asr/speech_to_label_results' - } - } - - stage('Speaker Diarization with ASR Inference') { - steps { - sh 'python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ - diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ - diarizer.asr.model_path=QuartzNet15x5Base-En \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results' - sh 'rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results' - } - } - - stage('Clustering Diarizer Inference') { - steps { - sh 'python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ - diarizer.speaker_embeddings.parameters.multiscale_weights=null \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results' - sh 'rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results' - } - } - - stage('Neural Diarizer Inference') { - steps { - sh 'python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - 
diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results' - sh 'rm -rf examples/speaker_tasks/diarization/neural_diarizer_results' - } - } - - stage('Multispeaker ASR Data Simulation') { - steps { - sh 'python tools/speech_data_simulator/multispeaker_simulator.py \ - --config-path=conf --config-name=data_simulator.yaml \ - data_simulator.random_seed=42 \ - data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ - data_simulator.outputs.output_dir=./test_simulator \ - data_simulator.session_config.num_sessions=2 \ - data_simulator.session_config.session_length=60' - sh 'rm -rf ./test_simulator' - } - } - } - } - // TODO: Enable test after 21.08 container is used. - // stage('L2: ASR DALI dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text - DALI AudioToMelSpectrogramPreprocessor') { - // steps { - // sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // +model.train_ds.use_dali=True \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // +model.validation_ds.use_dali=True \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_results' - // sh 'rm -rf examples/asr/speech_to_text_results' - // } - // } - // stage('Speech to Text BPE - DALI AudioToMelSpectrogramPreprocessor') { - // steps { - // sh 'python examples/asr/asr_ctc/speech_to_text_bpe.py \ - // --config-path="../conf/citrinet/" --config-name="config_bpe" \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // +model.train_ds.use_dali=True \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // +model.validation_ds.use_dali=True \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_wpe_results' - // } - // } - // // TODO: This would fail due to an unnecessary torchaudio import. 
- // // To be enabled once torchaudio is available in the container used for CI - // // stage('Speech to Text - DALI AudioToMFCCPreprocessor') { - // // steps { - // // sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - // // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // // +model.train_ds.use_dali=True \ - // // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // // +model.validation_ds.use_dali=True \ - // // model.preprocessor._target_=nemo.collections.asr.modules.AudioToMFCCPreprocessor \ - // // ~model.preprocessor.normalize \ - // // ~model.preprocessor.features \ - // // ~model.preprocessor.frame_splicing \ - // // ~model.preprocessor.dither \ - // // ~model.preprocessor.stft_conv \ - // // +model.n_mels=64 \ - // // +model.n_mfcc=64 \ - // // trainer.devices=[1] \ - // // trainer.accelerator="gpu" \ - // // +trainer.fast_dev_run=True \ - // // exp_manager.exp_dir=examples/asr/speech_to_text_results' - // // sh 'rm -rf examples/asr/speech_to_text_results' - // // } - // // } - // } - // } - - // TODO: Add back once CI is updated - // stage('L2: ASR RNNT dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text - RNNT') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_transducer/speech_to_text_rnnt.py \ - // --config-path="../conf/contextnet_rnnt/" --config-name="config_rnnt.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_results' - // sh 'rm -rf examples/asr/speech_to_text_rnnt_results' - // } - // } - // stage('L2: Speech to Text RNNT WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py \ - // --config-path="../conf/contextnet_rnnt/" --config-name="config_rnnt_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_rnnt_wpe_results' - // } - // } - // stage('L3: Speech to Text Hybrid Transducer-CTC WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py \ - // --config-path="../conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc/" --config-name="conformer_hybrid_transducer_ctc_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.encoder.n_layers= 2 \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // 
model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // } - // } - // } - // } - - // stage('L2: Hybrid ASR RNNT-CTC dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text Hybrid Transducer-CTC WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py \ - // --config-path="../conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc/" --config-name="conformer_hybrid_transducer_ctc_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.encoder.n_layers= 2 \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // } - // } - // } - // } - - stage('L2: ASR Multi-dataloader dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text multi-dataloader') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - } - - stage('Speech to Label multi-dataloader') { - steps { - sh 'python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results' - sh 'rm -rf examples/asr/speech_to_label_results' - } - } - } - } - - stage('L2: ASR Adapters') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Linear Adapters') { - steps { - sh 'python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="an4" \ - model.adapter.linear.in_features=176 \ - 
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results' - sh 'rm -rf examples/asr/speech_to_text_adapters_results' - } - } - stage('RelPos MHA Adapters') { - steps { - sh 'python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="encoder:an4" \ - model.adapter.adapter_type="tiny_attn" \ - model.adapter.tiny_attn.n_feat=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results' - sh 'rm -rf examples/asr/speech_to_text_adapters_mha_results' - } - } - - } - } - - stage('L2: Speech Transcription') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text Transcribe') { - steps { - sh 'python examples/asr/transcribe_speech.py \ - pretrained_name="QuartzNet15x5Base-En" \ - audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - output_filename="stt_test_res.json" \ - amp=true' - sh 'rm -rf stt_test_res.json' - } - } - } - } - stage('L2: Transducer alignment') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Running pytest') { - steps { - sh 'pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1' - } - } - } - } - - stage('L2: Segmentation Tool') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - stages { - stage('Install ctc_segmentation requirements') { - steps { - sh 'cd tools/ctc_segmentation && \ - pip install -r requirements.txt && \ - apt-get update && apt-get install libsox-fmt-all -y' - } - } - - stage('Parallel ctc_segmentation test') { - failFast true - parallel { - stage('L2: Eng CitriNet with .wav') { - steps { - sh 'cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ - --DATA_DIR=/home/TestData/ctc_segmentation/eng \ - --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \ - --LANGUAGE=en \ - --USE_NEMO_NORMALIZATION="TRUE" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ - -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt && \ - rm -rf /home/TestData/ctc_segmentation/eng/output${TIME}' - } - } - stage('L2: Ru QN with mp3') { - steps { - sh 'cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \ - --DATA_DIR=/home/TestData/ctc_segmentation/ru \ - --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \ - --LANGUAGE=ru \ - --ADDITIONAL_SPLIT_SYMBOLS=";" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \ - -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt 
&& \ - rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}' - } - } - } - } - } - } - - stage('L2: G2P Models') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('G2P Conformer training, evaluation and inference') { - steps { - sh 'cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ - python g2p_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/g2p.json \ - validation_manifest=/home/TestData/g2p/g2p.json \ - model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ - trainer.max_epochs=1 \ - model.max_source_len=64 \ - trainer.devices=[0] \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test \ - --config-name=g2p_conformer_ctc && \ - python g2p_inference.py \ - pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ - manifest_filepath=/home/TestData/g2p/g2p.json \ - phoneme_field=text' - } - } - // TODO: pleasefixme @redoctopus - // stage('ByT5G2P training, evaluation and inference') { - // steps { - // sh 'cd examples/tts/g2p && \ - // TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \ - // python g2p_train_and_evaluate.py \ - // train_manifest=/home/TestData/g2p/g2p.json \ - // validation_manifest=/home/TestData/g2p/g2p.json \ - // model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - // trainer.max_epochs=1 \ - // model.max_source_len=64 \ - // trainer.devices=[1] \ - // do_training=True \ - // do_testing=True \ - // exp_manager.exp_dir=${OUTPUT_DIR_T5} \ - // +exp_manager.use_datetime_version=False\ - // +exp_manager.version=test && \ - // python g2p_inference.py \ - // pretrained_model=${OUTPUT_DIR_T5}/T5G2P/test/checkpoints/T5G2P.nemo \ - // manifest_filepath=/home/TestData/g2p/g2p.json \ - // phoneme_field=text' - // } - // } - stage('HeteronymClassificationModel training, evaluation and inference') { - steps { - sh 'cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ - python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/manifest.json \ - validation_manifest=/home/TestData/g2p/manifest.json \ - test_manifest=/home/TestData/g2p/manifest.json \ - model.wordids=/home/TestData/g2p/wordids.tsv \ - trainer.max_epochs=1 \ - model.max_seq_length=64 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test && \ - python g2p_heteronym_classification_inference.py \ - manifest=/home/TestData/g2p/manifest.json \ - pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ - output_manifest=preds.json' - } - } - } - } - - // TODO: add test once megatron-bert is supported again - // stage('L2: Multi-GPU Megatron finetuning') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('L2: Cased Megatron finetuning on MRPC') { - // steps { - // sh 'cd examples/nlp/glue_benchmark && \ - // python glue_benchmark.py \ - // model.dataset.data_dir=/home/TestData/nlp/glue_fake/MRPC \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // model.dataset.use_cache=false \ - // 
model.language_model.pretrained_model_name=megatron-bert-345m-cased \ - // trainer.accelerator=gpu \ - // trainer.strategy=ddp \ - // exp_manager=null' - // } - // } - // } - // } - - stage('L2: STS-b') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('GLUE STS-b with AlBERT') { - steps { - sh 'python examples/nlp/glue_benchmark/glue_benchmark.py \ - model.dataset.use_cache=false \ - model.task_name=sts-b \ - model.dataset.data_dir=/home/TestData/nlp/glue_fake/STS-B \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.language_model.pretrained_model_name=albert-base-v1 \ - exp_manager=null' - } - } - stage('Test Restore Punctuation & Capitalization with AlBERT') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_and_Capitalization_albert.nemo \ - +model.test_ds.use_cache=false \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null && \ - rm -rf "${data_dir}"' - } - } -// stage('Test Restore Punctuation & Capitalization with RoBERTa') { -// steps { -// sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ -// cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ -// python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ -// +do_training=false \ -// +do_testing=true \ -// pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_and_Capitalization_roberta.nemo \ -// +model.test_ds.use_cache=false \ -// ~model.train_ds \ -// ~model.validation_ds \ -// model.test_ds.ds_item="${data_dir}" \ -// trainer.devices=[1] \ -// trainer.accelerator="gpu" \ -// exp_manager=null && \ -// rm -rf "${data_dir}"' -// } -// } - } - } - stage('L2: Dialogue Classification') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Intent and slot classification using GPT') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\ - model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - model.dataset.dialogues_example_dir=sgd_gen_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.tokenizer.special_tokens={pad_token:"endoftext"} \ - model.tokenizer.tokenizer_name=gpt2 \ - model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_outputs' - } - } - stage('Intent and slot classification using SGDQA') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - 
trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs' - } - } - stage('Intent and slot classification using IntentSlotClassificationModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs' - } - } - stage('Intent classification using ZeroShotIntentModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs' - } - } - stage('Design Intent classification using ZeroShotIntentModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs' - } - } - stage('Design Intent classification using ZeroShotIntentModel BART Classifier') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - 
model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs' - } - } - stage('Design Intent classification using DialogueNearestNeighbourModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs' - } - } - } - } - stage('L2: Dialogue Generation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Answer Extender using DialogueS2SGenerationModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s' - } - } - stage('Dialogue: SGD Based Answer Extender using DialogueS2SGenerationModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_answer_extender_s2s' - } - } - } - } -// stage('L2: Dialogue Generation Part 2') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// parallel { -// stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') { -// steps { -// sh 'cd examples/nlp/dialogue && \ -// python dialogue.py \ -// do_training=False \ -// 
model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ -// model.dataset.dialogues_example_dir=answer_extender \ -// model.library=huggingface \ -// model.dataset.task=ms_marco \ -// model.dataset.debug_mode=True \ -// trainer.val_check_interval=0.0 \ -// trainer.devices=[0] \ -// model.dataset.use_cache=false \ -// model.language_model.pretrained_model_name=gpt2 \ -// trainer.accelerator=gpu \ -// exp_manager=null && \ -// rm -rf answer_extender' -// } -// } -// } -// } - stage('L2: COPY') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') { - steps { - sh 'cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender' - } - } - } - } - stage('L2: Duplex Text Normalization') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Duplex Text Normalization with Tarred dataset') { - steps { - sh 'cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv' - } - } - } - } - // Runs out of memory on the 12G TITAN V (GPU 0 on main CI) - // TODO: add when megatron bert is supported again in NeMo - // stage('L2: MegaBERT Token Classification') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh 'cd examples/nlp/token_classification && \ - // python token_classification_train.py \ - // model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - // model.language_model.pretrained_model_name=megatron-bert-345m-uncased \ - // model.train_ds.batch_size=10 \ - // model.dataset.max_seq_length=50 \ - // model.dataset.use_cache=false \ - // trainer.accelerator=gpu \ - // trainer.strategy=ddp \ - // trainer.precision=16 \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // exp_manager=null' - // } - // } - - stage('L2: BERT Text Classification') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('Text Classification with BERT Test') { - steps { - sh 'cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - 
model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null' - } - } - } - } - - stage('L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('BERT SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - stage('BERT SQUAD 2.0') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - } - } - - stage('L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('BART SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - stage('BART SQUAD 2.0') { - // Cannot do 
fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - } - } - - stage('L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('GPT2 SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - stage('GPT2 SQUAD 2.0') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null' - } - } - } - } - - stage('L2: Intent and Slot Classification Tasks') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: Intent and Slot Classification') { - steps { - sh 'cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints' - sh 'rm -rf checkpoints' - } - } - stage('L2: Multi-Label Intent and Slot Classification') { - steps { - sh 'cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - 
model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2' - sh 'rm -rf checkpoints2' - } - } - } - } - - // TODO: add when megatron-bert is supported again - // stage('L2: Model Parallel Size 2 Megatron Text Classification') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python text_classification_with_bert.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // +trainer.fast_dev_run=true \ - // model.dataset.num_classes=6 \ - // model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - // model.train_ds.batch_size=4 \ - // model.language_model.pretrained_model_name=megatron-bert-uncased \ - // model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - // model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - // model.nemo_path=null \ - // ~model.infer_samples \ - // exp_manager=null' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Autoresume') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python text_classification_with_bert.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // trainer.max_epochs=1 \ - // +trainer.fast_dev_run=true \ - // model.dataset.num_classes=6 \ - // model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - // model.train_ds.batch_size=4 \ - // model.language_model.pretrained_model_name=megatron-bert-uncased \ - // model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - // model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - // model.nemo_path=null \ - // ~model.infer_samples \ - // +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - // +exp_manager.resume_if_exists=true' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python model_parallel_text_classification_evaluation.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // model.dataset.num_classes=6 \ - // model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - // model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - // exp_manager=null' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/token_classification && \ - // python token_classification_train.py \ - // pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - // model.dataset.data_dir=/home/TestData/nlp/ner/ \ - // model.train_ds.batch_size=2 \ - // model.dataset.use_cache=false \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // 
model.dataset.class_balancing="weighted_loss" \ - // exp_manager=null' - // } - // } - - stage('L2: Parallel NLP Examples 2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('NER finetuning from pretrained Test') { - steps { - sh 'cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null' - } - } - stage ('Punctuation and capitalization finetuning from pretrained test') { - steps { - sh 'cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null && \ - rm -rf "${data_dir}"' - } - } - stage ('NER with TurkuNLP/bert-base-finnish-cased-v1') { - steps { - sh 'cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null' - } - } - stage('Evaluation script for Token Classification') { - steps { - sh 'python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo' - } - } - stage('Evaluation script for Punctuation') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo && \ - rm -rf "${data_dir}"' - } - } - stage('L2: Punctuation & Capitalization, 2GPUs with DistilBERT, Fine-tuning on different data') { - steps { - sh 'cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - 
+model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}"' - } - } - } - } - - stage('Punctuation & Capitalization tarred dataset') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('create and use tarred dataset') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output && \ - rm -rf "${output_dir}" "${data_dir}"' - } - } - } - } - - stage('Punctuation & Capitalization, Different ways of passing labels to model') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('Punctuation & Capitalization, Using model.common_datasest_parameters.label_vocab_dir') { - steps { - sh 'cd 
examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}"' - } - } - stage('Punctuation & Capitalization, Using model.common_datasest_parameters.{punct,capit}_label_ids') { - steps { - sh 'cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - conf_name=punctuation_capitalization_config_with_ids && \ - cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \ - sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \ - "${work_dir}/${conf_name}.yaml" && \ - sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \ - "${work_dir}/${conf_name}.yaml" && \ - python punctuation_capitalization_train_evaluate.py \ - --config-path "${work_dir}" \ - --config-name "${conf_name}" \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - 
pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}"' - } - } - } - } - stage('Punctuation & Capitalization inference') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('Restore punctuation and capitalization in long text') { - steps { - sh 'output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32 && \ - rm -rf "${output_dir}"' - } - } - } - } - - stage('L2: Parallel Pretraining BERT pretraining from Text/Preprocessed') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: Pretraining BERT pretraining from Text') { - steps { - sh 'cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=true \ - model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ - model.train_ds.batch_size=32 \ - model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ - model.validation_ds.batch_size=32 \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ - model.optim.lr=0.01 \ - model.optim.sched.warmup_ratio=0.1 \ - model.tokenizer.tokenizer_name=sentencepiece \ - model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ - model.mask_prob=0.15 \ - model.short_seq_prob=0.1 \ - exp_manager.exp_dir=PretrainingBERTFromText \ - ' - sh 'rm -f /home/TestData/nlp/wikitext-2/*.pkl' - sh 'rm -rf examples/nlp/language_modeling/PretrainingBERTFromText' - sh 'ls -lha examples/nlp/language_modeling' - } - } - stage('L2: Pretraining BERT from Preprocessed') { - steps { - sh 'cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=false \ - +trainer.max_epochs=1 \ - +trainer.limit_val_batches=0 \ - +trainer.limit_train_batches=1 \ - model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ - model.train_ds.batch_size=8 \ - model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ - model.optim.lr=0.875e-4 \ - model.optim.weight_decay=0.01 \ - model.optim.sched.warmup_ratio=0.01 \ - exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ - exp_manager.create_checkpoint_callback=False \ - ' - sh 'rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed' - sh 'ls -lha examples/nlp/language_modeling' - } - } - } - } - - stage('L2: Entity Linking') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('Self Alignment Pretraining BERT') { - 
steps { - sh 'cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. \ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null' - } - } - } - } - - // TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 - // is in the release container - stage('L2: NMT Attention is All You Need Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: NMT Training Post-LN') { - steps { - sh 'python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=false \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=2 \ - +trainer.limit_val_batches=1 \ - +trainer.max_steps=2 \ - trainer.precision=16 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - ' - sh 'python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - 
+exp_manager.resume_if_exists=True \ - ' - sh 'rm -rf examples/nlp/machine_translation/nmt_results' - } - } - - stage('L2: NMT Training Pre-LN') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - ' - } - } - stage('L2: NMT Multi-Validation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - ' - } - } - } - } - - stage('L2: NMT Attention is All You Need Inference') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh 'cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang en \ - --source_lang de \ - ' - } - } - - stage('L2: NMT Attention is All You Need Finetuning') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "cd examples/nlp/machine_translation && \ - python enc_dec_nmt_finetune.py \ - model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - trainer.devices=[0] \ - ~trainer.max_epochs \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - 
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true \ - " - sh "rm -rf examples/nlp/machine_translation/nmt_finetune" - } - } - - - stage('L2: NMT Tarred Dataset Creation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: NMT Auto Tarred Dataset Creation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_training=false \ - model.preproc_out_dir=$PWD/preproc_out_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.n_preproc_jobs=2 \ - model.train_ds.lines_per_dataset_fragment=500 \ - model.train_ds.num_batches_per_tarfile=10 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.vocab_size=2000 \ - model.decoder_tokenizer.vocab_size=2000 \ - ~model.test_ds \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null \ - ' - } - } - - stage('L2: NMT Script Tarred Dataset Creation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python create_tarred_parallel_dataset.py \ - --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - --out_dir $PWD/out_dir \ - --encoder_tokenizer_vocab_size=2000 \ - --decoder_tokenizer_vocab_size=2000 \ - --tokens_in_batch=1000 \ - --lines_per_dataset_fragment=500 \ - --num_batches_per_tarfile=10 \ - --n_preproc_jobs=2 \ - ' - } - } - } - } - // stage('L2: Megatron NMT Training TP=2') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - // trainer.devices=2 \ - // trainer.accelerator=gpu \ - // trainer.log_every_n_steps=1 \ - // trainer.val_check_interval=10 \ - // +trainer.limit_val_batches=2 \ - // trainer.accumulate_grad_batches=1 \ - // trainer.max_steps=10 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - // model.tensor_model_parallel_size=2 \ - // model.seq_length=128 \ - // model.encoder.num_layers=4 \ - // model.encoder.hidden_size=64 \ - // model.encoder.num_attention_heads=8 \ - // model.encoder.activation='swiglu' \ - // model.encoder.masked_softmax_fusion=False \ - // model.encoder.bias_activation_fusion=False \ - // model.encoder.activations_checkpoint_method='block' \ - // model.encoder.activations_checkpoint_num_layers=1 \ - // model.decoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - 
// model.decoder.num_attention_heads=8 \ - // model.decoder.activation='swiglu' \ - // model.decoder.masked_softmax_fusion=False \ - // model.decoder.bias_activation_fusion=False \ - // model.decoder.activations_checkpoint_method='block' \ - // model.decoder.activations_checkpoint_num_layers=1 \ - // model.micro_batch_size=2 \ - // model.global_batch_size=4 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.train_ds.num_workers=1 \ - // model.validation_ds.num_workers=1 \ - // ~model.test_ds \ - // model.train_ds.dataset_type=text_memmap \ - // model.encoder_tokenizer.library=sentencepiece \ - // model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - // model.decoder_tokenizer.library=sentencepiece \ - // model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - // // Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - // // if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - // sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - // trainer.devices=2 \ - // trainer.accelerator=gpu \ - // trainer.log_every_n_steps=1 \ - // trainer.val_check_interval=1 \ - // +trainer.limit_val_batches=2 \ - // trainer.accumulate_grad_batches=1 \ - // trainer.max_steps=10 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - // model.tensor_model_parallel_size=2 \ - // model.seq_length=128 \ - // model.encoder.num_layers=4 \ - // model.encoder.hidden_size=64 \ - // model.encoder.num_attention_heads=8 \ - // model.encoder.activation='swiglu' \ - // model.encoder.masked_softmax_fusion=False \ - // model.encoder.bias_activation_fusion=False \ - // model.encoder.activations_checkpoint_method='block' \ - // model.encoder.activations_checkpoint_num_layers=1 \ - // model.decoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.num_attention_heads=8 \ - // model.decoder.activation='swiglu' \ - // model.decoder.masked_softmax_fusion=False \ - // model.decoder.bias_activation_fusion=False \ - // model.decoder.activations_checkpoint_method='block' \ - // model.decoder.activations_checkpoint_num_layers=1 \ - // model.micro_batch_size=2 \ - // model.global_batch_size=4 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.train_ds.num_workers=1 \ - // model.validation_ds.num_workers=1 \ - // ~model.test_ds \ - // model.train_ds.dataset_type=text_memmap \ - // model.encoder_tokenizer.library=sentencepiece \ - // model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - // model.decoder_tokenizer.library=sentencepiece \ - // 
model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - // sh "rm -rf examples/nlp/machine_translation/megatron_nmt_results" - // } - // } - stage('L2: Megatron BART Perceiver MIM Training TP=2') { - // Testing Megatron hidden transformations - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string=\'\"800,100,100\"\' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5" - // Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - // if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - 
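The MIM stage here lowers trainer.val_check_interval to 1 for the resume run because, as the inline comment notes, Lightning 2.x refuses a val_check_interval larger than the number of training batches. A minimal sketch of that constraint, assuming a plain PyTorch DataLoader; the helper below is illustrative and not part of NeMo:

```python
# Sketch of the guard implied by the comment above: Lightning 2.x rejects a
# val_check_interval larger than len(train_dataloader), which is why the resume
# run drops it to 1. Illustrative helper only, not NeMo code.
import torch
from torch.utils.data import DataLoader, TensorDataset

def clamped_val_check_interval(requested: int, train_loader: DataLoader) -> int:
    # On resume the dataloader can be effectively one batch long (max_steps nearly
    # reached), so the requested interval is capped at the dataloader length.
    return max(1, min(requested, len(train_loader)))

tiny_loader = DataLoader(TensorDataset(torch.randn(8, 4)), batch_size=8)  # one batch total
print(clamped_val_check_interval(10, tiny_loader))  # -> 1, matching trainer.val_check_interval=1
```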
model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string=\'\"800,100,100\"\' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5" - sh "rm -rf examples/nlp/language_modeling/megatron_mim_results" - } - } - // stage('L2: NMT Bottleneck Fallback') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('L2: seq2seq (no bottleneck)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=seq2seq \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - // model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - // model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - // stage('L2: NMT Bottleneck Architecture') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Bridge Encoder (identity)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=bridge \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=identity \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // 
model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // stage('Perceiver Encoder (params)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - // stage('L2: NMT Bottleneck LVM') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('VAE') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=vae \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // 
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // stage('MIM') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=mim \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - stage('L2: Megatron Bert Pretraining and Resume Training with Pipeline Paralleism') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - 
model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/bert_index_mappings" - } - } - stage('L2: Megatron Bert Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - 
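These BERT stages all follow the same train-then-resume pattern: a first run stops at trainer.max_steps=10 while writing checkpoints, then a second run with exp_manager.resume_if_exists=True and trainer.max_steps=20 continues from the last checkpoint. A rough equivalent in plain PyTorch Lightning, assuming a toy model and a local checkpoint directory (both are stand-ins, not the Megatron BERT setup itself):

```python
# Illustrative train-then-resume pattern with plain PyTorch Lightning.
# TinyRegressor and the "bert_pretrain_results" directory are stand-ins.
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader, TensorDataset

class TinyRegressor(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=2e-4)

loader = DataLoader(TensorDataset(torch.randn(64, 4), torch.randn(64, 1)), batch_size=8)

def make_trainer(max_steps: int) -> pl.Trainer:
    # Checkpoints land in one shared directory, like exp_manager.exp_dir above.
    return pl.Trainer(
        max_steps=max_steps,
        logger=False,
        callbacks=[ModelCheckpoint(dirpath="bert_pretrain_results", save_last=True)],
    )

# First run: stop at step 10 (mirrors trainer.max_steps=10).
make_trainer(10).fit(TinyRegressor(), loader)
# Second run: continue from the last checkpoint up to step 20
# (the rough analogue of exp_manager.resume_if_exists=True, trainer.max_steps=20).
make_trainer(20).fit(TinyRegressor(), loader, ckpt_path="bert_pretrain_results/last.ckpt")
```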
trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/bert_index_mappings" - } - } - stage('L2: Megatron Core Bert Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/bert_index_mappings" - } - } - stage('L2: NeMo Bert Embedding Finetuning and Resume') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "NVTE_FLASH_ATTN=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=12 \ - trainer.val_check_interval=4 \ - trainer.max_epochs=1 \ - +trainer.num_sanity_val_steps=0 \ - restore_from_path=/home/TestData/nlp/bert_embedding/bert_embedding_nemo_tiny.nemo \ - model.num_layers=2 \ - model.hidden_size=64 \ - model.ffn_hidden_size=256 \ - model.num_attention_heads=2 \ - model.megatron_legacy=False \ - model.mcore_bert=False \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.optim.lr=0.0005 \ - model.encoder_seq_length=512 \ - model.tokenizer.library='huggingface' \ - model.tokenizer.type='intfloat/e5-large-unsupervised' \ - model.data.data_train=/home/TestData/nlp/bert_embedding/bert_embedding_toy_data.jsonl \ - model.data.hard_negatives_to_train=4 \ - exp_manager.explicit_log_dir=examples/nlp/information_retrieval/bert_embedding_results \ - exp_manager.create_wandb_logger=False \ - exp_manager.resume_if_exists=False" - sh "NVTE_FLASH_ATTN=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=36 \ - trainer.val_check_interval=4 \ - trainer.max_epochs=1 \ - +trainer.num_sanity_val_steps=0 \ - restore_from_path=/home/TestData/nlp/bert_embedding/bert_embedding_nemo_tiny.nemo \ - model.num_layers=2 \ - model.hidden_size=64 \ - model.ffn_hidden_size=256 \ - model.num_attention_heads=2 \ - model.megatron_legacy=False \ - model.mcore_bert=False \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.optim.lr=0.0005 \ - model.encoder_seq_length=512 \ - model.tokenizer.library='huggingface' \ - model.tokenizer.type='intfloat/e5-large-unsupervised' \ - model.data.data_train=/home/TestData/nlp/bert_embedding/bert_embedding_toy_data.jsonl \ - model.data.hard_negatives_to_train=4 \ - exp_manager.explicit_log_dir=examples/nlp/information_retrieval/bert_embedding_results \ - exp_manager.create_wandb_logger=False \ - exp_manager.resume_if_exists=True" - sh "rm -rf examples/nlp/information_retrieval/bert_embedding_results" - } - } - stage('L2: Megatron Core Bert Embedding Finetuning and Resume') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "NVTE_FLASH_ATTN=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=12 \ - 
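The embedding finetuning stages pin model.global_batch_size=2 and model.micro_batch_size=1 on two GPUs; in Megatron-style configs these values are tied to the data-parallel size and gradient accumulation. A small illustrative helper (not NeMo code) showing the usual consistency relation:

```python
# Illustrative helper (not NeMo code): the usual Megatron-style relation a
# global_batch_size setting has to satisfy.
def expected_global_batch_size(micro_batch_size: int,
                               world_size: int,
                               tensor_parallel: int = 1,
                               pipeline_parallel: int = 1,
                               grad_accum: int = 1) -> int:
    data_parallel = world_size // (tensor_parallel * pipeline_parallel)
    return micro_batch_size * data_parallel * grad_accum

# Matches the embedding finetuning stages above: 2 GPUs, micro_batch_size=1,
# no TP/PP overrides, no accumulation -> global_batch_size=2.
assert expected_global_batch_size(micro_batch_size=1, world_size=2) == 2
```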
trainer.val_check_interval=4 \ - trainer.max_epochs=36 \ - +trainer.num_sanity_val_steps=0 \ - restore_from_path=/home/TestData/nlp/bert_embedding/bert_embedding_mcore_tiny.nemo \ - model.num_layers=2 \ - model.hidden_size=64 \ - model.ffn_hidden_size=256 \ - model.num_attention_heads=2 \ - model.megatron_legacy=False \ - model.mcore_bert=True \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.optim.lr=0.0005 \ - model.encoder_seq_length=512 \ - model.tokenizer.library='huggingface' \ - model.tokenizer.type='intfloat/e5-large-unsupervised' \ - model.data.data_train=/home/TestData/nlp/bert_embedding/bert_embedding_toy_data.jsonl \ - model.data.hard_negatives_to_train=4 \ - exp_manager.explicit_log_dir=examples/nlp/information_retrieval/bert_embedding_results \ - exp_manager.create_wandb_logger=False \ - exp_manager.resume_if_exists=False" - sh "NVTE_FLASH_ATTN=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=16 \ - trainer.val_check_interval=4 \ - trainer.max_epochs=1 \ - +trainer.num_sanity_val_steps=0 \ - restore_from_path=/home/TestData/nlp/bert_embedding/bert_embedding_mcore_tiny.nemo \ - model.num_layers=2 \ - model.hidden_size=64 \ - model.ffn_hidden_size=256 \ - model.num_attention_heads=2 \ - model.megatron_legacy=False \ - model.mcore_bert=True \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.optim.lr=0.0005 \ - model.encoder_seq_length=512 \ - model.tokenizer.library='huggingface' \ - model.tokenizer.type='intfloat/e5-large-unsupervised' \ - model.data.data_train=/home/TestData/nlp/bert_embedding/bert_embedding_toy_data.jsonl \ - model.data.hard_negatives_to_train=4 \ - exp_manager.explicit_log_dir=examples/nlp/information_retrieval/bert_embedding_results \ - exp_manager.create_wandb_logger=False \ - exp_manager.resume_if_exists=True" - sh "rm -rf examples/nlp/information_retrieval/bert_embedding_results" - } - } - stage('L2: Megatron RETRO Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10" - sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=['none'] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - 
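The RETRO stages pass model.data.splits_string="98,2,0", which carves a single corpus into train/validation/test spans by weight. A hedged sketch of that convention; the helper name and the exact rounding are illustrative, not NeMo's implementation:

```python
# Hedged sketch of the "98,2,0" splits_string convention: weights are normalised
# and turned into cumulative train/validation/test index ranges over one corpus.
def split_boundaries(splits_string: str, num_samples: int):
    weights = [float(w) for w in splits_string.split(",")]
    total = sum(weights)
    bounds, start = [], 0
    for w in weights:
        end = start + int(round(num_samples * w / total))
        bounds.append((start, min(end, num_samples)))
        start = end
    return bounds

print(split_boundaries("98,2,0", 1000))  # [(0, 980), (980, 1000), (1000, 1000)]
```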
model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string=\'\"98,2,0\"\' \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20" - sh "rm -rf examples/nlp/language_modeling/mcore_retro_results" - } - } - stage('L2: (Legacy) Megatron RETRO Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix='' \ - model.data.knn_index='' \ - model.data.retrieval_prefix='' \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True" - sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix='' \ - model.data.knn_index='' \ - model.data.retrieval_prefix='' \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True" - sh "rm -rf examples/nlp/language_modeling/retro_legacy_results" - } - } - stage('L2: (Legacy) Megatron RETRO muTransfer Pretraining Performance') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=100 \ - trainer.log_every_n_steps=1 \ - trainer.precision=16 \ - trainer.val_check_interval=100 \ - trainer.limit_val_batches=0 \ - trainer.gradient_clip_val=1.0 \ - +trainer.num_sanity_val_steps=0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results/ \ - +exp_manager.version=smalltest \ - 
model.data.neighbors=2 \ - model.megatron_amp_O2=False \ - model.apply_query_key_layer_scaling=False \ - model.tensor_model_parallel_size=1 \ - model.optim.name=muadamw \ - model.optim.weight_decay=0.1 \ - model.optim.betas=[0.9,0.95] \ - model.optim.lr=6e-4 \ - model.optim.sched.warmup_steps=1000 \ - model.optim.sched.constant_steps=0 \ - model.optim.sched.min_lr=6e-5 \ - model.add_position_embedding=False \ - model.enc_num_layers=2 \ - model.dec_num_layers=6 \ - model.enc_cross_attention=[0] \ - model.dec_cross_attention=[3,5] \ - model.hidden_size=96 \ - model.ffn_hidden_size=384 \ - model.init_method_std=0.023 \ - model.num_attention_heads=12 \ - model.max_position_embeddings=1024 \ - model.encoder_seq_length=1024 \ - model.tokenizer.library=megatron \ - model.tokenizer.type=GPT2BPETokenizer \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - model.data.num_workers=8 \ - model.micro_batch_size=8 \ - model.normalization=rmsnorm \ - model.transformer_block_type=pre_ln \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=False \ - model.masked_softmax_fusion=True \ - model.hidden_dropout=0 \ - model.attention_dropout=0 \ - model.fp32_residual_connection=True \ - model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml" - sh '''python -c "import pandas as pd -import pathlib -from pandas.testing import assert_frame_equal -from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -import torch -if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): - import sys - sys.exit(0) -event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_legacy_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] -ea = EventAccumulator(str(event_file)).Reload() -vals = [] -for i in ea.Scalars('reduced_train_loss'): - vals.append(i.value) -training_curve = pd.DataFrame({'loss': vals}) -gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') -assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' - sh "rm -rf examples/nlp/language_modeling/retro_legacy_results" - } - } - stage('L2: BioMegatron Bert NER Task') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null" - sh "rm -rf examples/nlp/language_modeling/token_classification_results" - } - } - stage('L2: Megatron GPT Pretraining and Resume Training TETransformerLayerTP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ 
- trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=layernorm1p \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=True \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=null \ - model.activations_checkpoint_granularity=null \ - model.activations_checkpoint_num_layers=null \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=layernorm1p \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=True \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=null \ - model.activations_checkpoint_granularity=null \ - model.activations_checkpoint_num_layers=null \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged - // @athitten: Revert limit_val_batches to 2 until limit_val_batches 1.0 leading to no validation is fixed for non DictConfig data_prefix - stage('L2: Megatron GPT Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh 
"python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - - stage('L2: Megatron GPT Pretraining and Resume Training TP=2 with Torch Distributed Checkpoint') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python 
examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.mcore_gpt=True \ - model.torch_distributed_checkpoint=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.transformer_engine=true \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_gpt=True \ - model.torch_distributed_checkpoint=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.transformer_engine=True \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } -/* - stage('L2: Megatron GPT Pretraining with EP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - 
trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.expert_model_parallel_size=2 \ - ++model.num_moe_experts=2 \ - ++model.moe_router_topk=1 \ - ++model.megatron_amp_O2=True \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } -*/ - stage('L2: Megatron GPT with Rope Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.transformer_engine=True \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document],validation:[/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document],test:[/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]}' \ - 
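The RoPE stage is the one place in this block that uses the per-split dict form of model.data.data_prefix instead of the flat weighted list used elsewhere. For reference, both shapes expressed as plain OmegaConf containers; this is illustrative only, and NeMo's own parsing of the Hydra override may differ in detail:

```python
# Illustrative only: the two data_prefix shapes used by these GPT stages,
# built as plain OmegaConf containers.
from omegaconf import OmegaConf

doc = "/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document"

# Flat weighted list: [weight, path, weight, path, ...], as in the other GPT stages.
flat = OmegaConf.create([0.5, doc, 0.5, doc])

# Per-split mapping: explicit train/validation/test lists, as in the RoPE stage here.
per_split = OmegaConf.create({"train": [1.0, doc], "validation": [doc], "test": [doc]})

print(OmegaConf.to_yaml(per_split))
```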
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // commented out to save time on github ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - //exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - //model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=rope \ - //model.rotary_percentage=0.5 \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - //model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - - // This test requires Ampere but some of the test GPUs are Volta - // Need to add a check for compute capability before uncommenting this test - // stage('L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - // trainer.devices=2 \ - // trainer.accelerator=gpu \ - // trainer.log_every_n_steps=1 \ - // trainer.val_check_interval=2 \ - // trainer.limit_val_batches=2 \ - // trainer.accumulate_grad_batches=1 \ - // trainer.max_steps=3 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - // model.tensor_model_parallel_size=2 \ - // model.optim.name=fused_adam \ - // model.optim.lr=2e-4 \ - // model.optim.sched.warmup_steps=1 \ - // model.optim.sched.constant_steps=1 \ - // model.optim.sched.min_lr=8e-5 \ - // model.max_position_embeddings=128 \ - // model.encoder_seq_length=128 \ - // model.data.seq_length=128 \ - // model.position_embedding_type=rope \ - // model.rotary_percentage=0.5 \ - // model.normalization=rmsnorm \ - // model.bias=False \ - // model.bias_activation_fusion=False \ - // model.bias_dropout_add_fusion=False \ - // model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - // model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - // model.num_layers=8 
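The stage above enables model.position_embedding_type=rope with model.rotary_percentage=0.5, i.e. rotary embeddings applied to half of each head's dimension. A self-contained sketch of that idea; the pairing convention in the Megatron/NeMo kernels differs in detail, so treat this as an illustration only:

```python
# Sketch of rotary position embeddings (RoPE) applied to half of the head dimension,
# echoing position_embedding_type=rope with rotary_percentage=0.5. Illustrative only.
import torch

def apply_rope(x: torch.Tensor, rotary_percentage: float = 0.5, base: float = 10000.0) -> torch.Tensor:
    # x: (seq_len, head_dim); only the leading rotary fraction of head_dim is rotated.
    seq_len, head_dim = x.shape
    rot_dim = int(head_dim * rotary_percentage)
    x_rot, x_pass = x[:, :rot_dim], x[:, rot_dim:]
    inv_freq = 1.0 / (base ** (torch.arange(0, rot_dim, 2).float() / rot_dim))
    angles = torch.arange(seq_len).float()[:, None] * inv_freq[None, :]
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x_rot[:, 0::2], x_rot[:, 1::2]
    rotated = torch.stack((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1).flatten(-2)
    return torch.cat([rotated, x_pass], dim=-1)

q = torch.randn(128, 64)    # seq_length=128 as in the stage above; head_dim chosen for illustration
print(apply_rope(q).shape)  # torch.Size([128, 64])
```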
\ - // model.hidden_size=256 \ - // model.num_attention_heads=8 \ - // model.activations_checkpoint_method='block' \ - // model.activations_checkpoint_granularity='full' \ - // model.activations_checkpoint_num_layers=1 \ - // model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - // model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - // model.use_flash_attention=True " - // // commented out to save time on github ci @adithyare - // //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - // //trainer.devices=2 \ - // //trainer.accelerator=gpu \ - // //trainer.log_every_n_steps=1 \ - // //trainer.val_check_interval=2 \ - // //trainer.limit_val_batches=1 \ - // //trainer.accumulate_grad_batches=1 \ - // //trainer.max_steps=6 \ - // //trainer.precision=16 \ - // //trainer.gradient_clip_val=1.0 \ - // //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - // //exp_manager.resume_if_exists=True \ - // //model.tensor_model_parallel_size=2 \ - // //model.optim.name=fused_adam \ - // //model.optim.lr=2e-4 \ - // //model.optim.sched.warmup_steps=2 \ - // //model.optim.sched.constant_steps=2 \ - // //model.optim.sched.min_lr=8e-5 \ - // //model.max_position_embeddings=128 \ - // //model.encoder_seq_length=128 \ - // //model.data.seq_length=128 \ - // //model.position_embedding_type=rope \ - // //model.rotary_percentage=0.5 \ - // //model.normalization=rmsnorm \ - // //model.bias=False \ - // //model.bias_activation_fusion=False \ - // //model.bias_dropout_add_fusion=False \ - // //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - // //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - // //model.num_layers=8 \ - // //model.hidden_size=256 \ - // //model.num_attention_heads=8 \ - // //model.activations_checkpoint_method='block' \ - // //model.activations_checkpoint_granularity='full' \ - // //model.activations_checkpoint_num_layers=1 \ - // //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - // //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - // //model.use_flash_attention=True" - // sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - // sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - // } - // } - // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged - stage('L2: Megatron GPT with ALiBi Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - 
model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // not testing resume functionality to save time on ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - //exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - //model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=alibi \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - //model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged - stage('L2: Megatron GPT with KERPLE Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - 
model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // commented out to save time on github ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - //exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - //model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=kerple \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - //model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged - stage('L2: Megatron GPT Pretraining and Resume Training PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ 
- trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.cpu_offloading_num_layers=7 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - stage('L2: Megatron GPT Finetuning PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - 
+trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.validation_ds.names=[quarel,trec]" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.validation_ds.names=[quarel,trec]" - sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" - } - } - stage('L2: Megatron GPT Finetuning StarCoder PP=1') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.optim.name=distributed_fused_adam \ - 
model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0]" - sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" - } - } - stage('L2: Megatron GPT PEFT Lora PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2" - } - } - stage('L2: Megatron GPT PEFT Lora TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - 
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl'" - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2" - } - } - stage('L2: Megatron GPT PEFT Lora TP=2 SP') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2_sp" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.sequence_parallel=true \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2_sp" - } - } - stage('L2: Megatron GPT Eval') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? 
A:'] \ - tensor_model_parallel_size=1 \ - inference.tokens_to_generate=32 \ - trainer.precision=32" - } - } - stage('L2: Megatron GPT Eval PP2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - server=False \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=32" - } - } - stage('L2: Megatron GPT SFT Eval (inference seq len > training seq len)') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ - model.peft.restore_from_path=null \ - model.data.test_ds.file_names=['/home/TestData/nlp/megatron_gpt_sft/sample.jsonl'] \ - model.data.test_ds.names=['test'] \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=30 \ - model.data.test_ds.max_seq_length=6000 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='examples/nlp/language_modeling/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='examples/nlp/language_modeling/out.jsonl' && \ - rm -rf examples/nlp/language_modeling/out.jsonl" - } - } - - // TODO: Add this test back. Test was failing on CI machines due to HW error - // stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "python -m torch.distributed.launch --nproc_per_node=2 \ - // examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - // --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ - // --checkpoint_name=model_optim_rng.pt \ - // --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ - // --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ - // --model_type=gpt \ - // --pipeline_model_parallel_size=1 \ - // --gpus_per_node=2 \ - // --tensor_model_parallel_size=2" - // sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - // --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ - // --tokens_to_generate=32 \ - // --tensor_model_parallel_size=2 \ - // --prompt='This is a test.'" - // sh "rm examples/nlp/language_modeling/small_gpt.nemo" - // } - // } - stage('L2: Megatron Change Partitions') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel{ - stage('Reduce TP Num Partitions (2 to 1) and PP Num Partitions (1 to 2)'){ - steps{ - sh "python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file \ - /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file \ - /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ - --tensor_model_parallel_size \ - 2 \ - --target_tensor_model_parallel_size \ - 1 \ - --pipeline_model_parallel_size \ - 1 \ - --target_pipeline_model_parallel_size \ - 2" - sh "rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo" - } - } - stage('Increase TP Num Partitions (2 to 4) and PP Num Partitions (1 to 2)'){ - steps{ - sh "python 
examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file \ - /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file \ - /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ - --tensor_model_parallel_size \ - 2 \ - --target_tensor_model_parallel_size \ - 4 \ - --pipeline_model_parallel_size \ - 1 \ - --target_pipeline_model_parallel_size \ - 1" - sh "rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo" - } - } - } - } - stage('L2: Megatron T5 Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='fast-swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - 
model.decoder.activation='fast-swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 with ALiBi Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - 
model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 with KERPLE Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - 
model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 Pretraining and Resume Training PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh 
"python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 w/ Mixture of Expert Pretraining') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.num_moe_experts=4 \ - model.decoder.num_moe_experts=4 \ - model.encoder.moe_frequency=3 \ - model.decoder.moe_frequency=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - - stage('L2: Megatron UL2 Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - 
trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='normformer' \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type='normformer' \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='normformer' \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type='normformer' \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 Eval') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file \ - /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt \ - 'How do I fix my GPU memory issue? I am seeing out of memory.' 
\ - --tensor_model_parallel_size 1" - } - } - stage('L2: Megatron BART Pretraining and Resume Training, TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'" - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'" - sh "rm -rf examples/nlp/language_modeling/bart_pretrain_results" - } - } - stage('L2: Megatron BART Pretraining and Resume Training, PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - 
trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='geglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]" - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='geglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]" - sh "rm -rf examples/nlp/language_modeling/bart_pretrain_results" - } - } - stage('L2: Megatron T5 GLUE/XNLI Finetuning') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - // TODO(Oktai15): update it in 1.8.0 version - stage('T5 GLUE RTE') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - 
model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv \ - " - sh "rm -rf examples/nlp/language_modeling/t5_glue_results" - } - } - stage('T5 GLUE XNLI') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - " - sh "rm -rf examples/nlp/language_modeling/t5_xnli_results" - } - } - } - } - - stage('L2: Megatron T5 PEFT Lora TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/nlp/t5_lora_tuning_tp2" - sh "python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ - 
model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/t5_lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl'" - sh "rm -rf /home/TestData/nlp/t5_lora_tuning_tp2" - } - } - - stage('L2: Megatron FIM Dataset') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=layernorm1p \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=True \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=null \ - model.activations_checkpoint_granularity=null \ - model.activations_checkpoint_num_layers=null \ - model.data.data_prefix='[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]' \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - ++model.data.add_fim=True \ - ++model.data.fim.extra_tokens.prefix='fim_prefix' \ - ++model.data.fim.extra_tokens.middle='fim_middle' \ - ++model.data.fim.extra_tokens.suffix='fim_suffix' \ - ++model.data.fim.extra_tokens.pad='fim_pad' \ - ++model.data.fim.extra_tokens.eod='endoftext'" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - } - } - - stage('L2: Megatron Mock Data Generation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('MockGPTDataset') { - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=7 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] \ - " - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - } - } - stage('MockT5Dataset') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=3 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] \ - " - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - } - } - } - } - - stage('L2: TTS Fast dev 
runs 1') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - parallel { - stage('Tacotron 2') { - steps { - sh 'python examples/tts/tacotron2.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.decoder.decoder_rnn_dim=256 \ - model.decoder.attention_rnn_dim=1024 \ - model.decoder.prenet_dim=128 \ - model.postnet.postnet_n_convolutions=3 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs \ - ~trainer.check_val_every_n_epoch \ - ' - } - } - stage('WaveGlow') { - steps { - sh 'python examples/tts/waveglow.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.waveglow.n_flows=4 \ - model.waveglow.n_wn_layers=2 \ - model.waveglow.n_wn_channels=32 \ - ~trainer.check_val_every_n_epoch' - } - } - stage('FastPitch') { - steps { - sh 'python examples/tts/fastpitch.py \ - --config-name fastpitch_align_v1.05 \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.symbols_embedding_dim=64 \ - model.input_fft.d_inner=384 \ - model.input_fft.n_layer=2 \ - model.output_fft.d_inner=384 \ - model.output_fft.n_layer=2 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - } - stage('RADTTS') { - steps { - sh 'python examples/tts/radtts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - export_dir=/home/TestData/radtts_test \ - model.optim.lr=0.0001 \ - model.modelConfig.decoder_use_partial_padding=True \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - 
} - stage('Mixer-TTS') { - steps { - sh 'python examples/tts/mixer_tts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/sup_data \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - } - stage('Hifigan') { - steps { - sh 'python examples/tts/hifigan.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - +trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.generator.upsample_initial_channel=64 \ - +model.debug=true \ - ~trainer.check_val_every_n_epoch' - } - } - } - } - stage('L2: NeRF') { - when { - anyOf { - branch 'r1.21.0' - changeRequest target: 'r1.21.0' - } - } - parallel { - stage('DreamFusion') { - steps { - sh 'python examples/multimodal/text_to_image/nerf/main.py \ - trainer.num_nodes=1 \ - trainer.devices="[0]" \ - trainer.max_steps=1000 \ - model.prompt="a DSLR photo of a delicious hamburger" \ - exp_manager.exp_dir=examples/multimodal/text_to_image/nerf/dreamfusion_results' - sh 'rm -rf examples/multimodal/text_to_image/nerf/dreamfusion_results' - } - } - } - } - stage('L??: Speech Checkpoints tests') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh 'CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ - pretrained_name=QuartzNet15x5Base-En \ - dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \ - batch_size=64 \ - tolerance=0.1012' - sh 'rm -f examples/asr/evaluation_transcripts.json' - } - } - } - - post { - always { - sh 'chmod -R 777 .' - cleanWs() - } - } -} diff --git a/README.rst b/README.rst index 66b3a5806c2d..89ed934527d8 100644 --- a/README.rst +++ b/README.rst @@ -77,63 +77,85 @@ Latest News +
+ Speech Recognition +
+ New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model (2024/04/18) + + The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. Canary also provides bi-directional translation between English and the three other supported languages. +

+
+ +
+ Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models (2024/04/18) + + NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy. +

+
+ +
+ Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT (2024/04/18) + + NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. This new addition to the NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previous best model, Parakeet-RNNT-1.1B. +

+
+ +
+ Introduction ------------ -NVIDIA NeMo Framework is a generative AI framework built for researchers and pytorch developers -working on large language models (LLMs), multimodal models (MM), automatic speech recognition (ASR), -and text-to-speech synthesis (TTS). -The primary objective of NeMo is to provide a scalable framework for researchers and developers from industry and academia -to more easily implement and design new generative AI models by being able to leverage existing code and pretrained models. +NVIDIA NeMo Framework is a scalable and cloud-native generative AI framework built for researchers and PyTorch developers working on Large Language Models (LLMs), Multimodal Models (MMs), Automatic Speech Recognition (ASR), Text to Speech (TTS), and Computer Vision (CV) domains. It is designed to help you efficiently create, customize, and deploy new generative AI models by leveraging existing code and pre-trained model checkpoints. For technical documentation, please see the `NeMo Framework User Guide `_. -All NeMo models are trained with `Lightning `_ and -training is automatically scalable to 1000s of GPUs. +LLMs and MMs Training, Alignment, and Customization +################################################### + +All NeMo models are trained with `Lightning `_. +Training is automatically scalable to 1000s of GPUs. + +When applicable, NeMo models leverage cutting-edge distributed training techniques, incorporating `parallelism strategies `_ to enable efficient training of very large models. These techniques include Tensor Parallelism (TP), Pipeline Parallelism (PP), Fully Sharded Data Parallelism (FSDP), Mixture-of-Experts (MoE), and Mixed Precision Training with BFloat16 and FP8, as well as others. + +NeMo Transformer-based LLMs and MMs utilize `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs, while leveraging `NVIDIA Megatron Core `_ for scaling Transformer model training. + +NeMo LLMs can be aligned with state-of-the-art methods such as SteerLM, Direct Preference Optimization (DPO), and Reinforcement Learning from Human Feedback (RLHF). See `NVIDIA NeMo Aligner `_ for more information. -When applicable, NeMo models take advantage of the latest possible distributed training techniques, -including parallelism strategies such as +In addition to supervised fine-tuning (SFT), NeMo also supports the latest parameter efficient fine-tuning (PEFT) techniques such as LoRA, P-Tuning, Adapters, and IA3. Refer to the `NeMo Framework User Guide `_ for the full list of supported models and techniques. -* data parallelism -* tensor parallelism -* pipeline model parallelism -* fully sharded data parallelism (FSDP) -* sequence parallelism -* context parallelism -* mixture-of-experts (MoE) +LLMs and MMs Deployment and Optimization +######################################## -and mixed precision training recipes with bfloat16 and FP8 training. +NeMo LLMs and MMs can be deployed and optimized with `NVIDIA NeMo Microservices `_. -NeMo's Transformer based LLM and Multimodal models leverage `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs -and leverages `NVIDIA Megatron Core `_ for scaling transformer model training. +Speech AI +######### -NeMo LLMs can be aligned with state of the art methods such as SteerLM, DPO and Reinforcement Learning from Human Feedback (RLHF), -see `NVIDIA NeMo Aligner `_ for more details. +NeMo ASR and TTS models can be optimized for inference and deployed for production use cases with `NVIDIA Riva `_. 
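As a quick illustration of this Speech AI workflow, the sketch below loads a pretrained ASR checkpoint and transcribes one file. This is a minimal, hedged example rather than part of this change; the audio path is a placeholder, and the ``QuartzNet15x5Base-En`` checkpoint name is taken from the CI tests elsewhere in this diff.

.. code-block:: python

    # Minimal sketch: load a pretrained NeMo ASR checkpoint and transcribe a file.
    # Assumes nemo_toolkit['asr'] is installed; the audio path is a placeholder.
    import nemo.collections.asr as nemo_asr

    # One of the freely available pretrained checkpoints on NGC / Hugging Face Hub.
    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="QuartzNet15x5Base-En")

    # `transcribe` takes a list of audio files and returns the predicted transcripts.
    transcripts = asr_model.transcribe(["/path/to/sample.wav"])
    print(transcripts[0])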
-NeMo LLM and Multimodal models can be deployed and optimized with `NVIDIA Inference Microservices (Early Access) `_. +NeMo Framework Launcher +####################### -NeMo ASR and TTS models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva `_. +`NeMo Framework Launcher `_ is a cloud-native tool that streamlines the NeMo Framework experience. It is used for launching end-to-end NeMo Framework training jobs on CSPs and Slurm clusters. -For scaling NeMo LLM and Multimodal training on Slurm clusters or public clouds, please see the `NVIDIA Framework Launcher `_. -The NeMo Framework launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and Multimodal models and also has an `Autoconfigurator `_ -which can be used to find the optimal model parallel configuration for training on a specific cluster. -To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_ -The NeMo Framework Launcher does not currently support ASR and TTS training but will soon. +The NeMo Framework Launcher includes extensive recipes, scripts, utilities, and documentation for training NeMo LLMs. It also includes the NeMo Framework `Autoconfigurator `_, which is designed to find the optimal model parallel configuration for training on a specific cluster. -Getting started with NeMo is simple. -State of the Art pretrained NeMo models are freely available on `HuggingFace Hub `_ and +To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_. The NeMo Framework Launcher does not currently support ASR and TTS training, but it will soon. + +Get Started with NeMo Framework +------------------------------- + +Getting started with NeMo Framework is easy. State-of-the-art pretrained NeMo models are freely available on `Hugging Face Hub `_ and `NVIDIA NGC `_. These models can be used to generate text or images, transcribe audio, and synthesize speech in just a few lines of code. We have extensive `tutorials `_ that -can be run on `Google Colab `_ or with our `NGC NeMo Framework Container. `_ -and we have `playbooks `_ for users that want to train NeMo models with the NeMo Framework Launcher. +can be run on `Google Colab `_ or with our `NGC NeMo Framework Container `_. We also have `playbooks `_ for users who want to train NeMo models with the NeMo Framework Launcher. -For advanced users that want to train NeMo models from scratch or finetune existing NeMo models -we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. +For advanced users who want to train NeMo models from scratch or fine-tune existing NeMo models, we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. Key Features ------------ @@ -147,9 +169,9 @@ Key Features Requirements ------------ -1) Python 3.10 or above -2) Pytorch 1.13.1 or above -3) NVIDIA GPU, if you intend to do model training +* Python 3.10 or above +* Pytorch 1.13.1 or above +* NVIDIA GPU (if you intend to do model training) Developer Documentation ----------------------- @@ -172,65 +194,61 @@ Developer Documentation | Stable | |stable| | `Documentation of the stable (i.e. most recent release) branch. 
`_ | +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ - -Getting help with NeMo +Install NeMo Framework ---------------------- -FAQ can be found on NeMo's `Discussions board `_. You are welcome to ask questions or start discussions there. - - -Installation ------------- The NeMo Framework can be installed in a variety of ways, depending on your needs. Depending on the domain, you may find one of the following installation methods more suitable. -* Conda / Pip - Refer to the `Conda <#conda>`_ and `Pip <#pip>`_ sections for installation instructions. +* Conda / Pip - Refer to `Conda <#conda>`_ and `Pip <#pip>`_ for installation instructions. + + * This is the recommended method for ASR and TTS domains. + * When using a Nvidia PyTorch container as the base, this is the recommended method for all domains. - * This is recommended for Automatic Speech Recognition (ASR) and Text-to-Speech (TTS) domains. - * When using a Nvidia PyTorch container as the base, this is the recommended installation method for all domains. +* Docker Containers - Refer to `Docker containers <#docker-containers>`_ for installation instructions. -* Docker Containers - Refer to the `Docker containers <#docker-containers>`_ section for installation instructions. + * NeMo Framework container - `nvcr.io/nvidia/nemo:24.05` - * This is recommended for Large Language Models (LLM), Multimodal and Vision domains. - * NeMo LLM & Multimodal Container - `nvcr.io/nvidia/nemo:24.03.framework` - * NeMo Speech Container - `nvcr.io/nvidia/nemo:24.01.speech` +* LLMs and MMs Dependencies - Refer to `LLMs and MMs Dependencies <#install-llms-and-mms-dependencies>`_ for installation instructions. -* LLM and Multimodal Dependencies - Refer to the `LLM and Multimodal dependencies <#llm-and-multimodal-dependencies>`_ section for isntallation instructions. - * It's higly recommended to start with a base NVIDIA PyTorch container: `nvcr.io/nvidia/pytorch:24.02-py3` +**Important: We strongly recommended that you start with a base NVIDIA PyTorch container: nvcr.io/nvidia/pytorch:24.02-py3.** Conda -~~~~~ +^^^^^^ -We recommend installing NeMo in a fresh Conda environment. +Install NeMo in a fresh Conda environment: .. code-block:: bash conda create --name nemo python==3.10.12 conda activate nemo -Install PyTorch using their `configurator `_. +Install PyTorch using their `configurator `_: .. code-block:: bash conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -The command used to install PyTorch may depend on your system. Please use the configurator linked above to find the right command for your system. +The command to install PyTorch may depend on your system. Use the configurator linked above to find the right command for your system. + +Then, install NeMo via Pip or from Source. We do not provide NeMo on the conda-forge or any other Conda channel. Pip -~~~ -Use this installation mode if you want the latest released version. +^^^ + +To install the nemo_toolkit, use the following installation method: .. code-block:: bash apt-get update && apt-get install -y libsndfile1 ffmpeg - pip install Cython + pip install Cython packaging pip install nemo_toolkit['all'] -Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command. +Depending on the shell used, you may need to use the ``"nemo_toolkit[all]"`` specifier instead in the above command. 
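A quick way to confirm that the ``nemo_toolkit`` installation succeeded is the short check below. It is a sketch, not an official verification script, and assumes the ``all`` (or at least ``asr``) extra was installed.

.. code-block:: python

    # Post-install sanity check (sketch): confirm the toolkit imports and
    # list a few pretrained ASR checkpoints that can be downloaded later.
    import nemo
    from nemo.collections.asr.models import ASRModel

    print(nemo.__version__)

    # Each entry describes a checkpoint usable with `from_pretrained`.
    for model_info in ASRModel.list_available_models()[:5]:
        print(model_info.pretrained_model_name)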
-Pip (Domain Specific) -~~~~~~~~~~~~~~~~~~~~~ +Pip from a Specific Domain +^^^^^^^^^^^^^^^^^^^^^^^^^^ -To install only a specific domain of NeMo, use the following commands. Note: It is required to install the above pre-requisites before installing a specific domain of NeMo. +To install a specific domain of NeMo, you must first install the nemo_toolkit using the instructions listed above. Then, you run the following domain-specific commands: .. code-block:: bash @@ -240,20 +258,22 @@ To install only a specific domain of NeMo, use the following commands. Note: It pip install nemo_toolkit['vision'] pip install nemo_toolkit['multimodal'] -Pip from source -~~~~~~~~~~~~~~~ -Use this installation mode if you want the version from a particular GitHub branch (e.g main). +Pip from a Source Branch +^^^^^^^^^^^^^^^^^^^^^^^^ + +If you want to work with a specific version of NeMo from a particular GitHub branch (e.g main), use the following installation method: .. code-block:: bash apt-get update && apt-get install -y libsndfile1 ffmpeg - pip install Cython + pip install Cython packaging python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all] -From source -~~~~~~~~~~~ -Use this installation mode if you are contributing to NeMo. +Build from Source +^^^^^^^^^^^^^^^^^ + +If you want to clone the NeMo GitHub repository and contribute to NeMo open-source development work, use the following installation method: .. code-block:: bash @@ -262,18 +282,16 @@ Use this installation mode if you are contributing to NeMo. cd NeMo ./reinstall.sh -If you only want the toolkit without additional conda-based dependencies, you may replace ``reinstall.sh`` -with ``pip install -e .`` when your PWD is the root of the NeMo repository. +If you only want the toolkit without the additional Conda-based dependencies, you can replace ``reinstall.sh`` with ``pip install -e .`` when your PWD is the root of the NeMo repository. -Mac computers with Apple silicon -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To install NeMo on Mac with Apple M-Series GPU: +Mac Computers with Apple Silicon +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- create a new Conda environment +To install NeMo on Mac computers with the Apple M-Series GPU, you need to create a new Conda environment, install PyTorch 2.0 or higher, and then install the nemo_toolkit. -- install PyTorch 2.0 or higher +**Important: This method is only applicable to the ASR domain.** -- run the following code: +Run the following code: .. code-block:: shell @@ -285,7 +303,7 @@ To install NeMo on Mac with Apple M-Series GPU: conda install -c conda-forge pynini # install Cython manually - pip install cython + pip install cython packaging # clone the repo and install in development mode git clone https://github.com/NVIDIA/NeMo @@ -295,24 +313,22 @@ To install NeMo on Mac with Apple M-Series GPU: # Note that only the ASR toolkit is guaranteed to work on MacBook - so for MacBook use pip install 'nemo_toolkit[asr]' Windows Computers -~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^ -One of the options is using Windows Subsystem for Linux (WSL). - -To install WSL: - -- In PowerShell, run the following code: +To install the Windows Subsystem for Linux (WSL), run the following code in PowerShell: .. code-block:: shell wsl --install # [note] If you run wsl --install and see the WSL help text, it means WSL is already installed. -Learn more about installing WSL at `Microsoft's official documentation `_. +To learn more about installing WSL, refer to `Microsoft's official documentation `_. 
+ +After installing your Linux distribution with WSL, two options are available: -After Installing your Linux distribution with WSL: - - **Option 1:** Open the distribution (Ubuntu by default) from the Start menu and follow the instructions. - - **Option 2:** Launch the Terminal application. Download it from `Microsoft's Windows Terminal page `_ if not installed. +**Option 1:** Open the distribution (Ubuntu by default) from the Start menu and follow the instructions. + +**Option 2:** Launch the Terminal application. Download it from `Microsoft's Windows Terminal page `_ if not installed. Next, follow the instructions for Linux systems, as provided above. For example: @@ -324,8 +340,11 @@ Next, follow the instructions for Linux systems, as provided above. For example: ./reinstall.sh RNNT -~~~~ -Note that RNNT requires numba to be installed from conda. +^^^^ + +For optimal performance of a Recurrent Neural Network Transducer (RNNT), install the Numba package from Conda. + +Run the following code: .. code-block:: bash @@ -333,14 +352,12 @@ Note that RNNT requires numba to be installed from conda. pip uninstall numba conda install -c conda-forge numba -LLM and Multimodal Dependencies -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Install LLMs and MMs Dependencies +--------------------------------- -The LLM and Multimodal domains require three additional dependencies: -NVIDIA Apex, NVIDIA Transformer Engine, and NVIDIA Megatron Core. +If you work with the LLM and MM domains, three additional dependencies are required: NVIDIA Apex, NVIDIA Transformer Engine, and NVIDIA Megatron Core. When working with the `main` branch, these dependencies may require a recent commit. -When working with the `main` branch these dependencies may require a recent commit. -The most recent working versions of these dependencies are: +The most recent working versions of these dependencies are here: .. code-block:: bash @@ -349,11 +366,14 @@ The most recent working versions of these dependencies are: export mcore_commit=fbb375d4b5e88ce52f5f7125053068caff47f93f export nv_pytorch_tag=24.02-py3 -When using a released version of NeMo, -please refer to the `Software Component Versions `_ -for the correct versions. +When using a released version of NeMo, please refer to the `Software Component Versions `_ for the correct versions. + +PyTorch Container +^^^^^^^^^^^^^^^^^ -If starting with a base NVIDIA PyTorch container first launch the container: +We recommended that you start with a base NVIDIA PyTorch container: nvcr.io/nvidia/pytorch:24.02-py3. + +If starting with a base NVIDIA PyTorch container, you must first launch the container: .. code-block:: bash @@ -366,15 +386,14 @@ If starting with a base NVIDIA PyTorch container first launch the container: --ulimit stack=67108864 \ nvcr.io/nvidia/pytorch:$nv_pytorch_tag -Then install the dependencies: +Next, you need to install the dependencies. Apex -~~~~ -NeMo LLM Multimodal Domains require that NVIDIA Apex to be installed. -Apex comes installed in the NVIDIA PyTorch container but it's possible that -NeMo LLM and Multimodal may need to be updated to a newer version. +^^^^ + +NVIDIA Apex is required for LLM and MM domains. Although Apex is pre-installed in the NVIDIA PyTorch container, you may need to update it to a newer version. -To install Apex, run +To install Apex, run the following code: .. code-block:: bash @@ -383,35 +402,32 @@ To install Apex, run git checkout $apex_commit pip install . 
-v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm" +When attempting to install Apex separately from the NVIDIA PyTorch container, you might encounter an error if the CUDA version on your system is different from the one used to compile PyTorch. To bypass this error, you can comment out the relevant line in the setup file located in the Apex repository on GitHub here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32. -While installing Apex outside of the NVIDIA PyTorch container, -it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with. -This raise can be avoided by commenting it here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32 +cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using. -cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using: +To install cuda-nvprof, run the following code: .. code-block:: bash conda install -c nvidia cuda-nvprof=11.8 -packaging is also needed: +Finally, install the packaging: .. code-block:: bash pip install packaging -With the latest versions of Apex, the `pyproject.toml` file in Apex may need to be deleted in order to install locally. - +To install the most recent versions of Apex locally, it might be necessary to remove the `pyproject.toml` file from the Apex directory. Transformer Engine -~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^ -The NeMo LLM Multimodal Domains require that NVIDIA Transformer Engine to be installed. -Transformer Engine comes installed in the NVIDIA PyTorch container but it's possible that -NeMo LLM and Multimodal may need Transformer Engine to be updated to a newer version. +NVIDIA Transformer Engine is required for LLM and MM domains. Although the Transformer Engine is pre-installed in the NVIDIA PyTorch container, you may need to update it to a newer version. -Transformer Engine enables FP8 training on NVIDIA Hopper GPUs and many performance optimizations for transformer-based model training. -Documentation for installing Transformer Engine can be found `here `_. +The Transformer Engine facilitates training with FP8 precision on NVIDIA Hopper GPUs and introduces many enhancements for the training of Transformer-based models. Refer to `Transformer Enginer `_ for information. + +To install Transformer Engine, run the following code: .. code-block:: bash @@ -424,14 +440,11 @@ Documentation for installing Transformer Engine can be found `here `_. +-------------------- + +NeMo Text Processing, specifically Inverse Text Normalization, is now a separate repository. It is located here: `https://github.com/NVIDIA/NeMo-text-processing `_. + +Docker Containers +----------------- -Docker containers -~~~~~~~~~~~~~~~~~ -We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.23.0`` comes with container ``nemo:24.01.speech``, you may find more details about released containers in `releases page `_. +NeMo containers are launched concurrently with NeMo version updates. NeMo Framework now supports LLMs, MMs, ASR, and TTS in a single consolidated Docker container. You can find additional information about released containers on the `NeMo releases page `_. -To use a pre-built container, please run +To use a pre-built container, run the following code: .. 
code-block:: bash - docker pull nvcr.io/nvidia/nemo:24.01.speech + docker pull nvcr.io/nvidia/nemo:24.05 -To build a nemo container with Dockerfile from a branch, please run +To build a nemo container with Dockerfile from a branch, run the following code: .. code-block:: bash - DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . - + DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 23.10-py3 and then installing from GitHub. @@ -472,25 +485,32 @@ If you choose to work with the main branch, we recommend using NVIDIA's PyTorch -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.10-py3 -Examples --------- -Many examples can be found under the `"Examples" `_ folder. +Future Work +----------- + +The NeMo Framework Launcher does not currently support ASR and TTS training, but it will soon. +Discussions Board +----------------- -Contributing ------------- +FAQ can be found on the NeMo `Discussions board `_. You are welcome to ask questions or start discussions on the board. + +Contribute to NeMo +------------------ We welcome community contributions! Please refer to `CONTRIBUTING.md `_ for the process. Publications ------------- +------------------ We provide an ever-growing list of `publications `_ that utilize the NeMo Framework. -If you would like to add your own article to the list, you are welcome to do so via a pull request to this repository's ``gh-pages-src`` branch. -Please refer to the instructions in the `README of that branch `_. +To contribute an article to the collection, please submit a pull request to the ``gh-pages-src`` branch of this repository. For detailed information, please consult the README located at the `gh-pages-src branch `_. + +Licenses +-------- + +* `NeMo GitHub Apache 2.0 license `__ -License -------- -NeMo is released under an `Apache 2.0 license `_. +* NeMo is licensed under the `NVIDIA AI PRODUCT AGREEMENT `__. By pulling and using the container, you accept the terms and conditions of this license. 
diff --git a/ci.groovy b/ci.groovy deleted file mode 100644 index 27ad659b99a1..000000000000 --- a/ci.groovy +++ /dev/null @@ -1,119 +0,0 @@ -@Library('blossom-github-lib@master') -import ipp.blossom.* - -podTemplate(cloud:'sc-ipp-blossom-prod', yaml : """ -apiVersion: v1 -kind: Pod -metadata: - labels: - some-label: some-label-value -spec: - volumes: - - name: scratch - nfs: - server: ipp1-cdot01-col01 - path: /vol/scratch1/scratch.okuchaiev_blossom - containers: - - name: latestdlfw - image: nvcr.io/nvidia/pytorch:23.02-py3 - command: - - cat - volumeMounts: - - name: scratch - mountPath: /testdata - resources: - limits: - nvidia.com/gpu: 2 - restartPolicy: Never - backoffLimit: 4 - tty: true - shm-size: 32g - nodeSelector: - kubernetes.io/os: linux - nvidia.com/gpu_type: "Tesla_T4x4" - nvidia.com/node_type: gpu_tester - nvidia.com/driver_version: "510.20" -""" -) { - node(POD_LABEL) { - def githubHelper - stage('Get Token') { - withCredentials([usernamePassword(credentialsId: 'GHAtoken', passwordVariable: 'GIT_PASSWORD', usernameVariable: 'GIT_USERNAME')]) { - // create new instance of helper object - githubHelper = GithubHelper.getInstance("${GIT_PASSWORD}", githubData) - } - - } - def stageName = '' - try { - currentBuild.description = githubHelper.getBuildDescription() - container('latestdlfw') { - stage('Code checkout') { - // update status on github - githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Running", GitHubCommitState.PENDING) - checkout changelog: true, poll: true, scm: [$class: 'GitSCM', branches: [[name: "pr/"+githubHelper.getPRNumber()]], - doGenerateSubmoduleConfigurations: false, - submoduleCfg: [], - userRemoteConfigs: [[credentialsId: 'github-token', url: githubHelper.getCloneUrl(), refspec: '+refs/pull/*/head:refs/remotes/origin/pr/*']]] - } - - stage('Code Style') { - sh "apt-get update && \ - apt-get install -y bc && \ - nvidia-smi && \ - pip install -r requirements/requirements_test.txt && \ - python setup.py style && ls -l /testdata/TestData && ln -s /testdata/TestData /home/TestData && \ - ls -l /home && ls -l /home/TestData" - } - - stage('Installation') { - sh "git config --global --add safe.directory '*' && nvidia-smi && ./reinstall.sh release" - } - - stage('L0: GPU unit tests') { - sh "NEMO_NUMBA_MINVER=0.53 pytest -m 'not pleasefixme'" - } - - parallel( //USE CUDA_VISIBLE_DEVICES to execute 2 single GPU tests in parallel here - [ - "L1: NMT Training Pre-LN": { sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.decoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - '}, - "L1: 
Speech to text": { sh 'CUDA_VISIBLE_DEVICES=1 python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager=null \ - '} - ] - )//end of parallel - } - githubHelper.updateCommitStatus("$BUILD_URL", "Complete", GitHubCommitState.SUCCESS) - } - catch (Exception ex){ - currentBuild.result = 'FAILURE' - println ex - githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Failed", GitHubCommitState.FAILURE) - } - - } - } \ No newline at end of file diff --git a/docs/source/core/core_index.rst b/docs/source/apis.rst similarity index 74% rename from docs/source/core/core_index.rst rename to docs/source/apis.rst index 01977c1b5101..e3c199bb47d5 100644 --- a/docs/source/core/core_index.rst +++ b/docs/source/apis.rst @@ -14,14 +14,26 @@ You can learn more about aspects of the NeMo "core" by following the links below :name: core :titlesonly: - core - neural_modules - exp_manager - neural_types - export - adapters/intro - api + core/core + core/neural_modules + core/exp_manager + core/neural_types + core/export + core/adapters/intro +You can learn more about aspects of the NeMo APIs by following the links below: + +.. toctree:: + :maxdepth: 1 + :name: API + :titlesonly: + + core/api + common/intro + nlp/api + multimodal/api + asr/api + tts/api Alternatively, you can jump straight to the documentation for the individual collections: diff --git a/docs/source/asr/api.rst b/docs/source/asr/api.rst index 2eb687d97d8e..c99d92c0371a 100644 --- a/docs/source/asr/api.rst +++ b/docs/source/asr/api.rst @@ -1,5 +1,5 @@ -NeMo ASR Collection API -======================= +NeMo ASR API +============ Model Classes diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index b4656eec3f3f..a6e9cbe96c63 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -261,11 +261,6 @@ Semi Sorted Batching Sorting samples by duration and spliting them into batches speeds up training, but can degrade the quality of the model. To avoid quality degradation and maintain some randomness in the partitioning process, we add pseudo noise to the sample length when sorting. - .. image:: images/ssb.png - :align: center - :alt: semi sorted batching - :scale: 50% - It may result into training speeedup of more than 40 percent with the same quality. To enable and use semi sorted batching add some lines in config. .. code:: @@ -772,30 +767,30 @@ To enable multimodal dataloading, we provide several configuration options: Example 3. Combine an ASR (audio-text) dataset with an MT (text-only) dataset so that mini-batches have some examples from both datasets. 
Provide a custom prompt field for both datasets (to be leveraged by a relevant dataset class): -```yaml -use_multimodal_sampling: true -batch_tokens: 1024 -token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor -quadratic_factor: 50 -num_buckets: 30 -use_bucketing: true -input_cfg: - - type: nemo_tarred - manifest_filepath: /path/to/manifest__OP_0..512_CL_.json - tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar - weight: 0.5 - tags: - lang: en - prompt: "Given the following recording, transcribe what the person is saying:" - - type: txt_pair - source_path: /path/to/en__OP_0..512_CL_.txt - target_path: /path/to/pl__OP_0..512_CL_.txt - source_language: en - target_language: pl - weight: 0.5 - tags: - prompt: "Translate the following text to Polish:" -``` +.. code-block:: yaml + + use_multimodal_sampling: true + batch_tokens: 1024 + token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor + quadratic_factor: 50 + num_buckets: 30 + use_bucketing: true + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.5 + tags: + lang: en + prompt: "Given the following recording, transcribe what the person is saying:" + - type: txt_pair + source_path: /path/to/en__OP_0..512_CL_.txt + target_path: /path/to/pl__OP_0..512_CL_.txt + source_language: en + target_language: pl + weight: 0.5 + tags: + prompt: "Translate the following text to Polish:" .. caution:: We strongly recommend to use multiple shards for text files as well so that different nodes and dataloading workers are able to randomize the order of text iteration. Otherwise, multi-GPU training has a high risk of duplication of text examples. diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 7d1270af1267..d353b4d983dd 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -156,11 +156,11 @@ Canary-1B is a multi-lingual, multi-task model, supporting automatic speech-to-t .. raw:: html - diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 97dafcb2bf6d..f002137beb0f 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -46,12 +46,14 @@ HuggingFace Spaces to try out Parakeet models in your browser: * `Parakeet-TDT-1.1B `__ space .. _Conformer_model: + Conformer --------- + .. _Conformer-CTC_model: + Conformer-CTC ~~~~~~~~~~~~~ -------------- Conformer-CTC is a CTC-based variant of the Conformer model introduced in :cite:`asr-models-gulati2020conformer`. Conformer-CTC has a similar encoder as the original Conformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. diff --git a/docs/source/asr/speech_intent_slot/api.rst b/docs/source/asr/speech_intent_slot/api.rst index 735c583f9115..d45f24f807f6 100644 --- a/docs/source/asr/speech_intent_slot/api.rst +++ b/docs/source/asr/speech_intent_slot/api.rst @@ -15,8 +15,10 @@ Mixins .. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin :show-inheritance: :members: + :no-index: .. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin :show-inheritance: :members: + :no-index: diff --git a/docs/source/asr/ssl/api.rst b/docs/source/asr/ssl/api.rst index 7103243a4b20..8e6f83986032 100644 --- a/docs/source/asr/ssl/api.rst +++ b/docs/source/asr/ssl/api.rst @@ -15,10 +15,12 @@ Mixins .. 
autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin :show-inheritance: :members: + :no-index: .. autoclass:: nemo.core.classes.mixins.access_mixins.AccessMixin :show-inheritance: :members: + :no-index: diff --git a/docs/source/asr/ssl/intro.rst b/docs/source/asr/ssl/intro.rst index d1a7366164d8..76a3a75dcf37 100644 --- a/docs/source/asr/ssl/intro.rst +++ b/docs/source/asr/ssl/intro.rst @@ -1,5 +1,5 @@ -Self-Supervised Learning -================================= +Speech Self-Supervised Learning +=============================== Self-Supervised Learning (SSL) refers to the problem of learning without explicit labels. As any learning process require feedback, without explit labels, SSL derives supervisory signals from diff --git a/docs/source/ckpt_converters/dev_guide.rst b/docs/source/ckpt_converters/dev_guide.rst index 9faa752df2e1..601e69749b64 100644 --- a/docs/source/ckpt_converters/dev_guide.rst +++ b/docs/source/ckpt_converters/dev_guide.rst @@ -48,7 +48,7 @@ Script Placement and Naming Conventions Code Template ------------- -Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `_ as an full example for development. +Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `__ as an full example for development. .. code-block:: python @@ -210,7 +210,7 @@ A Simple Guide for Model Mapping and Conversion 2. **Common issues when converting: results not matching between Community model and NeMo model**: - a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `_ for guidance. + a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `__ for guidance. b. GLU Variants weights could also be a common source of error. In Megatron Core, the regular feedforward projection weights and gated forward weights are fused together, requiring careful attention to the order of these two. Refer to the `Gemma Huggingface to NeMo converter `_ for more details. 
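To make the fused-weight ordering issue in point 2b concrete, here is a simplified sketch of how the two halves of a gated feed-forward layer from a community checkpoint might be combined into a single fused projection. The tensor names (``gate_proj``, ``up_proj``) and the concatenation order are illustrative assumptions; always verify the layout expected by the converter and your Megatron Core version.

.. code-block:: python

    # Illustrative sketch only: fuse separate gate/up GLU projections into one matrix.
    # Swapping the two blocks produces a checkpoint that loads cleanly but gives
    # wrong outputs, which is why the order deserves careful attention.
    import torch

    hidden_size, ffn_hidden_size = 8, 32
    gate_proj = torch.randn(ffn_hidden_size, hidden_size)  # gating half of the GLU
    up_proj = torch.randn(ffn_hidden_size, hidden_size)    # regular feed-forward half

    fused_fc1 = torch.cat([gate_proj, up_proj], dim=0)
    assert fused_fc1.shape == (2 * ffn_hidden_size, hidden_size)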
diff --git a/docs/source/ckpt_converters/user_guide.rst b/docs/source/ckpt_converters/user_guide.rst index 9de22f4b5994..451679a7e3ae 100644 --- a/docs/source/ckpt_converters/user_guide.rst +++ b/docs/source/ckpt_converters/user_guide.rst @@ -6,45 +6,45 @@ This guide provides instructions on how to use the conversion scripts to convert Support Matrix -------------- -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Conversion | From | To | Github Link | -+======================+==================+=====================+====================================================================================================================+ -| Baichuan | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Baichuan | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| BERT | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| BERT | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Falcon | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Falcon | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | JAX | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Gemma | PyTorch | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| LLaMA | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| LLaMA | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mistral 7B | Hugging Face 
| NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mistral 7B | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mixtral | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Mixtral | NeMo | Hugging Face | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| MPT | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ -| Starcoder | Hugging Face | NeMo | `Link `_ | -+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+ ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Conversion | From | To | Github Link | ++======================+==================+=====================+=====================================================================================================================+ +| Baichuan | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Baichuan | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| BERT | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| BERT | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Falcon | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Falcon | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | JAX | NeMo | `Link `__ | 
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Gemma | PyTorch | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| LLaMA | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| LLaMA | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mistral 7B | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mistral 7B | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mixtral | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Mixtral | NeMo | Hugging Face | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| MPT | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ +| Starcoder | Hugging Face | NeMo | `Link `__ | ++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+ Convert Hugging Face LLaMA Checkpoints to NeMo @@ -54,7 +54,7 @@ To convert a Hugging Face LLaMA checkpoint into a NeMo checkpoint, use the follo .. code-block:: bash - python convert_llama_hf_to_nemo.py>`_ \ + python convert_llama_hf_to_nemo.py \ --input_name_or_path \ --output_path @@ -67,7 +67,7 @@ To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two .. code-block:: bash - python convert__nemo_to_hf.py>`_ \ + python convert__nemo_to_hf.py \ --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ --output_path /path/to/pytorch_model.bin @@ -75,7 +75,7 @@ To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two .. 
code-block:: bash - python convert__nemo_to_hf.py>`_ \ + python convert__nemo_to_hf.py \ --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \ --output_path /path/to/model_folder \ --hf_input_path /path/to/input_hf_folder \ diff --git a/docs/source/collections.rst b/docs/source/collections.rst index 1cc7a654b9c1..d4bea503513b 100644 --- a/docs/source/collections.rst +++ b/docs/source/collections.rst @@ -11,26 +11,9 @@ Documentation for the individual collections :titlesonly: nlp/nemo_megatron/intro - nlp/models nlp/machine_translation/machine_translation nlp/megatron_onnx_export nlp/quantization - nlp/api - - -.. toctree:: - :maxdepth: 1 - :caption: Speech AI - :name: Speech AI - :titlesonly: - - asr/intro - asr/speech_classification/intro - asr/speaker_recognition/intro - asr/speaker_diarization/intro - asr/ssl/intro - asr/speech_intent_slot/intro - .. toctree:: :maxdepth: 1 @@ -42,29 +25,32 @@ Documentation for the individual collections multimodal/vlm/intro multimodal/text2img/intro multimodal/nerf/intro - multimodal/api - .. toctree:: :maxdepth: 1 - :caption: Text To Speech (TTS) - :name: Text To Speech + :caption: Vision (CV) + :name: vision :titlesonly: - tts/intro + vision/intro .. toctree:: :maxdepth: 1 - :caption: Vision (CV) - :name: vision + :caption: Speech AI + :name: Speech AI :titlesonly: - vision/intro + asr/intro + asr/speech_classification/intro + asr/speaker_recognition/intro + asr/speaker_diarization/intro + asr/ssl/intro + asr/speech_intent_slot/intro .. toctree:: :maxdepth: 1 - :caption: Common - :name: Common + :caption: Text To Speech (TTS) + :name: Text To Speech :titlesonly: - common/intro \ No newline at end of file + tts/intro diff --git a/docs/source/common/intro.rst b/docs/source/common/intro.rst index fadbd9528485..a89f1a480e5d 100644 --- a/docs/source/common/intro.rst +++ b/docs/source/common/intro.rst @@ -1,5 +1,5 @@ -Common Collection -================= +NeMo Common Collection API +========================== The common collection contains things that could be used across all collections. 
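As a small illustration of the kind of building block that lives in the common collection, the sketch below loads a shared tokenizer. The ``.model`` path is a placeholder for a trained SentencePiece model file; this is an example of usage, not part of this change.

.. code-block:: python

    # Sketch: tokenizers are one example of a utility shared across collections.
    from nemo.collections.common.tokenizers import SentencePieceTokenizer

    tokenizer = SentencePieceTokenizer(model_path="/path/to/tokenizer.model")
    print(tokenizer.text_to_ids("hello world"))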
diff --git a/docs/source/conf.py b/docs/source/conf.py index e8fba7457605..c599f630d7f7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -113,10 +113,9 @@ "sphinx.ext.viewcode", "sphinx.ext.napoleon", "sphinx.ext.githubpages", - "sphinxcontrib.bibtex", "sphinx.ext.inheritance_diagram", "sphinx.ext.intersphinx", - "sphinx.ext.autosectionlabel", + # "sphinx.ext.autosectionlabel", "sphinxcontrib.bibtex", "sphinx_copybutton", "sphinxext.opengraph", diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst index b0f2a8e13610..8922c72d63eb 100644 --- a/docs/source/core/adapters/api.rst +++ b/docs/source/core/adapters/api.rst @@ -9,6 +9,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -17,6 +18,7 @@ Core :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -28,6 +30,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -35,6 +38,7 @@ Adapter Networks :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -47,6 +51,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -55,6 +60,7 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -63,3 +69,4 @@ Adapter Strategies :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst index cc2ea0b525df..d8bed1b23a75 100644 --- a/docs/source/core/adapters/components.rst +++ b/docs/source/core/adapters/components.rst @@ -8,7 +8,7 @@ An adapter module can be any pytorch module, but it must follow certain straight 1) The model accepts an input of some input dimension, and its output must match this dimension. 2) Ideally, the module is initialized such that the output of the adapter when initialized is such that it does not modify the original input. This allows the model to produce the same output results, even when additional parameters have been added. -According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider an adapter being represented as three components - +According to Junxian et al :cite:`adapters-components-Junxian2021unified`, we can consider an adapter being represented as three components - 1) Functional form - the trainable parameters that will modify the input 2) Insertion form - Where the adapter outputs are integrated with the original input. The input to the adapters can be the last output of the layer, the input to some attention layer, or even the original input to the module itself (before even the modules forward pass). @@ -17,7 +17,7 @@ According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider Functional Form - Adapter Networks ================================== -Adapter modules represent the functional form of the adapter. We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-houlsby2019adapter`. +Adapter modules represent the functional form of the adapter. We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-components-houlsby2019adapter`. .. note:: @@ -28,6 +28,7 @@ Adapter modules represent the functional form of the adapter. 
We discuss an exam :show-inheritance: :members: :member-order: bysource + :no-index: ----- @@ -35,12 +36,13 @@ Adapter modules represent the functional form of the adapter. We discuss an exam :show-inheritance: :members: :member-order: bysource + :no-index: Insertion Form - Module Adapters -------------------------------- -Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers. +Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-components-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers. On top of this, while adapters are commonly used only in the layers containing the most parameters (say the Encoder of a network), some models can support adapters in multiple locations (Encoder-Decoder architecture for Language Models, Machine Translation, or even Encoder-Decoder-Joint for ASR with Transducer Loss). As such, NeMo utilizes the concept of ``Module Adapters``. @@ -70,6 +72,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -78,6 +81,7 @@ We discuss a simple residual additional connection strategy below - that accepts :members: :member-order: bysource :undoc-members: adapter_module_names + :no-index: ----- @@ -87,4 +91,4 @@ References .. bibliography:: ./adapter_bib.bib :style: plain - :keyprefix: adapters- + :keyprefix: adapters-components- diff --git a/docs/source/core/adapters/intro.rst b/docs/source/core/adapters/intro.rst index fd94c8d23446..8c5e9cbc8895 100644 --- a/docs/source/core/adapters/intro.rst +++ b/docs/source/core/adapters/intro.rst @@ -144,4 +144,5 @@ References .. bibliography:: ./adapter_bib.bib :style: plain + :labelprefix: adapters :keyprefix: adapters- diff --git a/docs/source/core/api.rst b/docs/source/core/api.rst index 6b389ca3be85..1aceb73de0d9 100644 --- a/docs/source/core/api.rst +++ b/docs/source/core/api.rst @@ -1,6 +1,6 @@ -Core APIs -========= +NeMo Core APIs +============== Base class for all NeMo models ------------------------------ diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 6e5efa56d5f0..1c9325cf0a96 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -16,9 +16,10 @@ NeMo models contain everything needed to train and reproduce Conversational AI m NeMo uses `Hydra `_ for configuring both NeMo models and the PyTorch Lightning Trainer. -.. note:: Every NeMo model has an example configuration file and training script that can be found `here `_. +.. note:: + Every NeMo model has an example configuration file and training script that can be found `here `__. -The end result of using NeMo, `Pytorch Lightning `_, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem. 
+The end result of using NeMo, `Pytorch Lightning `__, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem. Pretrained ---------- @@ -42,14 +43,14 @@ To see all available pretrained models for a specific NeMo model, use the ``list For detailed information on the available pretrained models, refer to the collections documentation: -- :ref:`Automatic Speech Recognition (ASR)` +- :doc:`Automatic Speech Recognition (ASR) <../asr/intro>` - :doc:`Natural Language Processing (NLP) <../nlp/models>` - :doc:`Text-to-Speech Synthesis (TTS) <../tts/intro>` Training -------- -NeMo leverages `PyTorch Lightning `_ for model training. PyTorch Lightning lets NeMo decouple the +NeMo leverages `PyTorch Lightning `__ for model training. PyTorch Lightning lets NeMo decouple the conversational AI code from the PyTorch training code. This means that NeMo users can focus on their domain (ASR, NLP, TTS) and build complex AI applications without having to rewrite boiler plate code for PyTorch training. @@ -298,7 +299,7 @@ With NeMo and Hydra, every aspect of model training can be modified from the com of experiments on compute clusters or for quickly testing parameters while developing. All NeMo `examples `_ come with instructions on how to -run the training/inference script from the command-line (see `here `_ +run the training/inference script from the command-line (see `here `__ for an example). With Hydra, arguments are set using the ``=`` operator: diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst index b44d27c38b4b..efb55b0feabb 100644 --- a/docs/source/core/exp_manager.rst +++ b/docs/source/core/exp_manager.rst @@ -379,3 +379,4 @@ ExpManagerConfig :show-inheritance: :members: :member-order: bysource + :no-index: diff --git a/docs/source/core/export.rst b/docs/source/core/export.rst index 990769452a5c..c53dd8159a60 100644 --- a/docs/source/core/export.rst +++ b/docs/source/core/export.rst @@ -194,7 +194,7 @@ To facilitate that, the hooks below are provided. To export, for example, 'encod First goes the one receiving input (input_example) """ -Some nertworks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export: +Some networks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export: .. code-block:: Python @@ -202,6 +202,7 @@ Some nertworks may be exported differently according to user-settable options (l """ Sets/updates export_config dictionary """ + Also, if an action hook on setting config is desired, this method may be overloaded by `Exportable` descendants to include one. An example can be found in ``/nemo/collections/asr/models/rnnt_models.py``. diff --git a/docs/source/core/neural_types.rst b/docs/source/core/neural_types.rst index 9003b9ca5203..ec7d94336c05 100644 --- a/docs/source/core/neural_types.rst +++ b/docs/source/core/neural_types.rst @@ -24,6 +24,7 @@ Types are implemented in ``nemo.core.neural_types.NeuralType`` class. When you i are expected to include both *axes* information and *element type* information. .. 
autoclass:: nemo.core.neural_types.NeuralType + :no-index: Type Comparison Results ----------------------- @@ -31,6 +32,7 @@ Type Comparison Results When comparing two neural types, the following comparison results are generated. .. autoclass:: nemo.core.neural_types.NeuralTypeComparisonResult + :no-index: Examples -------- @@ -113,6 +115,7 @@ Custom element types It is possible to create user-defined element types to express the semantics of elements in your tensors. To do so, the user will need to inherit and implement abstract methods of the ``nemo.core.neural_types.elements.ElementType`` class .. autoclass:: nemo.core.neural_types.elements.ElementType + :no-index: Note that element types can be parametrized. Consider this example where it distinguishes between audio sampled at 8Khz and 16Khz. diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/memory_optimizations.rst index 0e0b3ad84402..4d363670fedf 100644 --- a/docs/source/features/memory_optimizations.rst +++ b/docs/source/features/memory_optimizations.rst @@ -3,7 +3,7 @@ Memory Optimizations Parallelism ----------- -Refer to :doc:`Parallelism <./parallelism>`. +Refer to :doc:`Parallelism <./parallelisms>`. Flash Attention --------------- @@ -11,38 +11,97 @@ Flash Attention Overview ^^^^^^^^ -Flash Attention is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as Natural Language Processing (NLP). Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. FlashAttention, an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms. +Flash attention is an algorithm designed to improve the efficiency of the attention mechanism in transformer models such as GPT and BERT. The attention mechanism has quadratic time and memory complexity in sequence length and can present significant runtime and memory challenges for longer sequences. + +Compared to the standard, non-flash algorithm, flash attention applies two techniques to lower the memory requirement and improve compute efficiency. + +The tiling technique decomposes the inputs based on the shared memory size and calculates the softmax one tile at a time. Instead of working on the entire query, key, value tensors at once, it makes several passes at these tensors and then combines the results in a subsequent step. + +The recomputation technique stores the softmax normalization factors (linear in sequence length), instead of the softmax results (quadratic in sequence length), and uses these normalization factors to recompute the attention scores. This reduces the amount of data written to global memory, lowering both the memory requirement and the I/O traffic between global memory and shared memory. + +Flash attention lowers the memory footprint and computational complexity from quadratic to linear, greatly extending the range of sequence lengths allowed in large language models. + +The flash attention algorithm was first proposed `here `_. Two of its implementations are `flash-attention `_ by Tri Dao *et al.*, and `fused flash attention `_ by NVIDIA cuDNN.
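To make the tiling and recomputation ideas above concrete, below is a minimal NumPy sketch (a toy reference for illustration, not the fused CUDA kernels of the implementations linked above). It processes one key/value tile at a time and keeps only running softmax statistics, so the full score matrix is never materialized.

.. code-block:: python

    import numpy as np

    def tiled_attention(q, k, v, tile=16):
        """softmax(q @ k.T / sqrt(d)) @ v, computed one key/value tile at a time."""
        d = q.shape[-1]
        out = np.zeros_like(q)
        row_max = np.full(q.shape[0], -np.inf)   # running max of attention scores
        row_sum = np.zeros(q.shape[0])           # running softmax denominator
        for start in range(0, k.shape[0], tile):
            s = q @ k[start:start + tile].T / np.sqrt(d)   # scores for this tile only
            new_max = np.maximum(row_max, s.max(axis=-1))
            correction = np.exp(row_max - new_max)         # rescale earlier partial results
            p = np.exp(s - new_max[:, None])
            out = out * correction[:, None] + p @ v[start:start + tile]
            row_sum = row_sum * correction + p.sum(axis=-1)
            row_max = new_max
        return out / row_sum[:, None]

    # Reference that materializes the full score matrix, for comparison.
    rng = np.random.default_rng(0)
    q, k, v = rng.standard_normal((3, 64, 32))
    scores = q @ k.T / np.sqrt(32)
    ref = np.exp(scores - scores.max(-1, keepdims=True))
    ref = ref / ref.sum(-1, keepdims=True) @ v
    assert np.allclose(tiled_attention(q, k, v), ref)

The recomputation technique plays the same role in the backward pass: only the per-row statistics are kept, and the tile-local attention scores are recomputed when gradients are needed.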
Turn Flash Attention On and Off ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In the NeMo Framework, Flash Attention is supported through the Transformer Engine with the inclusion of Flash Attention 2. By default, Flash Attention is enabled, but the Transformer Engine may switch to a different kernel if the tensor dimensions are not optimal for Flash Attention. Users can completely disable Flash Attention by setting the environment variable ``NVTE_FLASH_ATTN=0``. +In the NeMo framework, flash attention is supported through `Transformer Engine `_, including both of the implementations mentioned above. Transformer Engine selects the appropriate implementation based on input information such as sequence length, number of heads and head dimension. When both implementations are applicable, Transformer Engine prefers cuDNN flash attention on Hopper+ architectures and Tri Dao flash attention on Ampere architectures. + +To disable Tri Dao flash attention, set the environment variable ``NVTE_FLASH_ATTN=0``. To disable cuDNN flash attention, set ``NVTE_FUSED_ATTN=0``. -For more details on the supported Dot Attention backend, please refer to the Transformer Engine source code available at `Transformer Engine's Attention Mechanism `_. +For more details on the Dot Product Attention backends supported in Transformer Engine, please refer to the source code at `Transformer Engine's Attention Mechanism `_. -.. bibliography:: ./nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- +Activation Recomputation +------------------------ Overview ^^^^^^^^ Full Activation Recomputation """"""""""""""""""""""""""""" -This method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. +The full activation recomputation method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. Partial Activation Recomputation """""""""""""""""""""""""""""""" -This method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. +The partial activation recomputation method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. Selective Activation Recomputation """""""""""""""""""""""""""""""""" -This method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. +The selective activation recomputation method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. 
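As a rough sketch of how a recomputation mode is selected in practice, the snippet below builds a config override with OmegaConf. The key names (``activations_checkpoint_granularity``, ``activations_checkpoint_method``, ``activations_checkpoint_num_layers``) follow the NeMo Megatron GPT example config, but treat the exact names and values here as illustrative assumptions rather than a tuned recipe.

.. code-block:: python

    # Illustrative only: choosing an activation recomputation mode through
    # config overrides (key names assumed from the Megatron GPT example config).
    from omegaconf import OmegaConf

    overrides = OmegaConf.create(
        {
            "model": {
                # "selective" recomputes only the memory-heavy attention pieces;
                # "full" recomputes entire transformer layers.
                "activations_checkpoint_granularity": "selective",
                # Only used with "full" granularity: how many layers to recompute
                # and how they are grouped ("uniform" or "block").
                "activations_checkpoint_method": "uniform",
                "activations_checkpoint_num_layers": 1,
            }
        }
    )
    print(OmegaConf.to_yaml(overrides))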
+ +Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198. + +Multi-query Attention (MQA) and Grouped-query Attention (GQA) +------------------------------------------------------------- + +**Multi-query Attention (MQA)** and **Grouped-query Attention (GQA)** are modifications of the traditional multihead attention mechanism in Transformer models. These methods improve the efficiency and effectiveness of attention mechanisms. + +Overview +^^^^^^^^ + +**Multi-query Attention (MQA)** + MQA treats all attention heads as a single group, reducing computational complexity and accelerating training times. It is beneficial when model scalability or limited computational resources are concerns. + +**Grouped-query Attention (GQA)** + GQA groups the heads into clusters, each processing a subset of queries independently. This method balances the detailed focus of traditional multihead attention with the broad approach of MQA, enhancing nuanced input data processing. + +These attention variants offer: + +- **Reduced computational load**: Both methods decrease computation, beneficial for large models. +- **Increased processing speed**: Simplifying attention leads to faster training and inference. +- **Flexibility and adaptability**: Adjustments can be made based on task needs or hardware constraints. + +Enable MQA and GQA +^^^^^^^^^^^^^^^^^^ + +To use MQA or GQA in the NeMo Framework, adjust the ``num_query_groups`` parameter in the model configuration: + +1. **For Multi-query Attention (MQA)**: + - Set ``num_query_groups`` to `1` to treat all attention heads as a single group. + + .. code-block:: yaml + + num_query_groups: 1 # Enables Multi-query Attention + +2. **For Grouped-query Attention (GQA)**: + - Set ``num_query_groups`` to a number that is a divisor of the total number of attention heads (more than one but less than the total heads). + + .. code-block:: yaml + + num_query_groups: # Enables Grouped-query Attention + + - For regular attention, set this parameter to `None` or match it with the number of heads. + + .. code-block:: yaml + + num_query_groups: null # Default setting for regular multihead attention + +Adjust the ``num_query_groups`` to explore different attention mechanisms and optimize your model's performance based on specific needs. + +Implement MQA or GQA +^^^^^^^^^^^^^^^^^^^^ -Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198 +NeMo's support for GQA and MQA is enabled through the integration of Megatron Core's Attention mechanism. The underlying implementation details can be explored within the Attention class of Megatron Core, which provides the functional backbone for these advanced attention methods. To understand the specific modifications and implementations of MQA and GQA, refer to the source code in the Attention class: -.. 
bibliography:: ./nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- \ No newline at end of file +Check implementation details from Attention Class in Megatron Core Repo: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/attention.py#L49 diff --git a/docs/source/features/mixed_precision.rst b/docs/source/features/mixed_precision.rst index d193752e5475..ba0dfb4e945b 100644 --- a/docs/source/features/mixed_precision.rst +++ b/docs/source/features/mixed_precision.rst @@ -4,3 +4,45 @@ Mixed Precision Training ------------------------ Mixed precision training significantly enhances computational efficiency by conducting operations in half-precision and fp8 formats, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo now supports FP16, BF16, and FP8 (via Transformer Engine) across most models. Further details will be provided shortly. + + +FP8 usage +========= + +Overview +^^^^^^^^ + +The NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. NeMo uses the NVIDIA `TransformerEngine `_ (TE) in order to leverage speedups from FP8. The following table summarizes the FP8 related arguments that can be configured in NeMo (`example config setting `_). For a more detailed overview, refer to the TE `documentation `_, specifically the FP8 `format `_ and `recipe `_. + +.. list-table:: FP8 arguments + :widths: 25 75 + :header-rows: 1 + + * - Argument + - Description + * - transformer_engine + - TE and related functionality can be enabled by setting this boolean argument to True. If this argument is not set to True, all subsequent arguments will be ignored. + * - fp8 + - Enables FP8 training. For transformer networks, the QKV, projection, FC1, and FC2 matrix multiplications are executed using the 4th generation H100 tensor cores with FP8 support. + * - fp8_e4m3 + - Training recipe format for FP8. Activations, weights, and gradient tensors use the E4M3 format. + * - fp8_hybrid + - Training recipe format for FP8. Activation and weight tensors use the E4M3 format, whereas gradient tensors use the E5M2 format to satisfy the additional dynamic range requirement for backward tensors. This is the default setting. + * - fp8_margin + - The scaling factor for FP8 tensors can be shifted by a factor of $2 ^ {margin}$ using this argument. + * - fp8_amax_history_len + - Window size for amax history. The window size determines how many instances of the most recent absolute max values (amaxes) are stored per tensor. + * - fp8_amax_compute_algo + - The choice between “max” and “most_recent” specifies how to select an amax value from the given history. + * - reduce_amax + - Indicates whether or not to perform an allreduce on the amax (absolute max) values for the FP8 tensors. Since the amax is directly used to compute the scaling factor for FP8 tensors, setting this argument ensures that the scaling factors for a tensor remain synchronized across devices in multi-GPU training configurations. + * - fp8_params + - Indicates whether or not to store module level parameters in FP8. Enabling this option can lead to reduced memory consumption. It eliminates the need to store a copy of weights in higher precision (> half) for cases where these weights are externally maintained, such as master parameters in the optimizer. For more information, refer to the `fp8_model_init `_ API in TE.
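To tie these arguments together, here is a hedged sketch of an FP8 override assembled with OmegaConf. The flag names mirror the table above; the nesting under ``model`` and the specific values are assumptions made for illustration only.

.. code-block:: python

    # A sketch, not an official recipe: combining the FP8 flags from the table
    # above into one override (nesting under "model" is an assumption).
    from omegaconf import OmegaConf

    fp8_overrides = OmegaConf.create(
        {
            "model": {
                "transformer_engine": True,    # required for the flags below to take effect
                "fp8": True,                   # run QKV/projection/FC1/FC2 GEMMs in FP8
                "fp8_hybrid": True,            # E4M3 activations/weights, E5M2 gradients
                "fp8_margin": 0,
                "fp8_amax_history_len": 1024,  # example window size, tune per workload
                "fp8_amax_compute_algo": "max",
                "reduce_amax": True,
            }
        }
    )
    print(OmegaConf.to_yaml(fp8_overrides))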
+ +Resources +^^^^^^^^^ + +- `TE documentation `_ +- `Intro to FP8, floating point formats, and mixed precision training `_ +- `Performance optimizations `_ that are natively supported in NeMo by enabling FP8 training with TE +- `TE installation `_ diff --git a/docs/source/features/parallelisms.rst b/docs/source/features/parallelisms.rst index b10477e4232c..4cc493f40024 100644 --- a/docs/source/features/parallelisms.rst +++ b/docs/source/features/parallelisms.rst @@ -3,59 +3,246 @@ Parallelisms ------------ -NeMo Megatron supports 5 types of parallelisms (which can be mixed together arbitrarily): +NeMo Megatron supports five types of parallelism (which can be mixed together arbitrarily). + +Data Parallelism +^^^^^^^^^^^^^^^^ + +Data Parallelism (DP) creates identical copies of the model across +multiple GPUs. Data batches are distributed between GPUs so that the +GPUs can process them independently. While compute is efficiently +distributed between GPUs, communication is required in order to keep +the model copies consistent with each other. Distributed Data Parallelism -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Distributed Data Parallelism (DDP) creates idential copies of the model across multiple GPUs. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Distributed Data Parallelism (DDP) keeps model copies consistent by +synchronizing parameter gradients before each optimization step. More +specifically, it sums gradients over all model copies using an +all-reduce communication collective. .. image:: ../nlp/nemo_megatron/images/ddp.gif :align: center :width: 800px :alt: Distributed Data Parallel +Distributed Optimizer (ZeRO-1) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ZeRO-1 algorithm keeps model copies consistent by sharding the +optimizer state between GPUs. During each optimization step, the +parameter gradients are first summed and sharded (with a +reduce-scatter collective), each GPU applies an optimization to its +local shard of the parameters, and the updated parameter shards are +broadcast to update all of the model copies (with an all-gather +collective). This approach is attractive for large models since +sharding the optimizer state can significantly reduce its memory +footprint on individual GPUs. It also has, in theory, the same +communication volume as DDP and its communication pattern has more +opportunities for overlapping with compute. + +Enable Data Parallelism +~~~~~~~~~~~~~~~~~~~~~~~ + +DDP is the default parallelism scheme when NeMo is run on multiple +GPUs. Enabling other parallelism schemes in the model configuration +will decrease the size of the DP group, that is the number of +identical model copies. + +To enable the distributed optimizer, set +``model.optim.name=distributed_fused_adam`` in the model +configuration. It can be configured with the following options: + +=========================== ========= ================================================================================================================================== +Option Default Description +=========================== ========= ================================================================================================================================== +``dtype`` fp32 Optimizer state datatype +``grad_sync_dtype`` ``dtype`` Gradient reduce-scatter datatype +``overlap_grad_sync`` True Overlap gradient reduce-scatter with compute +``overlap_param_sync`` False Overlap parameter all-gather with compute +``bucket_cap_mb`` 100 Buffer size (in MiB) for internal state and workspaces. 
Larger buckets have lower runtime overheads but may increase memory usage. +``contiguous_param_buffer`` False Allocate parameters as views into a large buffer. Helps avoid some data copies. +``contiguous_grad_buffer`` True Allocate parameter gradients as views into a large buffer. Helps avoid some data copies. +=========================== ========= ================================================================================================================================== + +See the keyword arguments in `Apex DistributedFusedAdam `_ and `NeMo MegatronDistributedFusedAdam `_ for a full list of distributed optimizer options. + +Implement Data Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DDP in NeMo either uses PyTorch +`DistributedDataParallel `_ +(default) or a custom implementation (if custom multi-precision +training is enabled with ``megatron_amp_O2``). + +The distributed optimizer in NeMo is built on top of +`DistributedFusedAdam `_ +from Apex. Tensor Parallelism ^^^^^^^^^^^^^^^^^^ -With Tensor Paralellism (TP) a tensor is split into non-overlapping pieces and -different parts are distributed and processed on separate GPUs. + +Tensor Parallelism (TP) is a method for distributing a model's computation across multiple GPUs by splitting tensors into non-overlapping pieces. This allows different parts of the tensor to be processed simultaneously on separate GPUs, enhancing performance and enabling the training of larger models. .. image:: ../nlp/nemo_megatron/images/tp.gif :align: center :width: 800px :alt: Tensor Parallel +Enable Tensor Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~ + +To enable TP in the NeMo framework, configure the ``tensor_model_parallel_size`` parameter in the model configuration. This parameter determines the number of GPUs among which the model's tensors are partitioned. + +**For Tensor Parallelism**: + +Set ``tensor_model_parallel_size`` to greater than ``1`` to enable intra-layer model parallelism. + + .. code-block:: yaml + + tensor_model_parallel_size: 1 # Example to enable Tensor Parallelism + +The configuration file can be adjusted here: `NeMo Megatron GPT Config `_. + +Implement Tensor Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +NeMo integrates Tensor Parallelism through the implementation from Megatron Core. To understand how TP is activated within transformer blocks, refer to the code in the following repository: `Megatron-LM Transformer Block `_. + +For detailed API usage and additional configurations, consult the `Megatron Core Developer Guide `_. + Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ -With Pipeline Paralellism (PP) consecutive layer chunks are assigned to different GPUs. + +Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. .. image:: ../nlp/nemo_megatron/images/pp.gif :align: center :width: 800px :alt: Pipeline Parallel + +Enable Pipeline Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To utilize PP in the NeMo framework, you need to set the ``pipeline_model_parallel_size`` parameter in the model's configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. + +**For Pipeline Parallelism**: + +Set ``pipeline_model_parallel_size`` to a value greater than ``1`` to enable inter-layer model parallelism. + + .. 
code-block:: yaml + + pipeline_model_parallel_size: 1 # Example to enable Pipeline Parallelism + +Adjust the configuration accordingly here: `NeMo Megatron GPT Config `_. + +Interleaved Pipeline Parallel Schedule +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To minimize the pipeline bubble, the computation on each GPU can be divided into multiple subsets of layers (referred to as model chunks), rather than a single contiguous block. For instance, instead of each GPU processing a continuous set of four layers, it might handle two model chunks with two layers each. + + .. code-block:: yaml + + virtual_pipeline_model_parallel_size: 2 # Set for interleaved pipeline + +For more insights into this approach, see our detailed blog: `Scaling Language Model Training `_. + +Implement Pipeline Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The NeMo implementation of PP leverages functionalities from Megatron Core. For a practical example of how PP is implemented within transformer blocks in NeMo, you can inspect the following codebase: `Megatron-LM Transformer Block `_. + +For more detailed API usage and configurations related to PP, visit the `Megatron Core Developer Guide `_. + Sequence Parallelism ^^^^^^^^^^^^^^^^^^^^ +Sequence Parallelism extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. + .. image:: ../nlp/nemo_megatron/images/sp.gif :align: center :width: 800px :alt: Sequence Parallel +Enable Sequence Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To utilize Sequence Parallelism in NeMo, set the ``sequence_parallel`` parameter to ``True`` in the model's configuration. Note that this feature is effective only when the tensor parallel size (``tensor_model_parallel_size``) is greater than ``1``. + + .. code-block:: yaml + + sequence_parallel: True # Enable Sequence Parallelism + +For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `_. + +Implement Sequence Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The NeMo implementation of Sequence Parallelism utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code here: `Megatron-LM Sequence Parallel Source Code `_. + +Context Parallelism +^^^^^^^^^^^^^^^^^^^ + +Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs, focusing on the sequence dimension of the input data. Unlike Sequence Parallelism (SP) that only partitions specific types of activations, CP divides all network activations along the sequence dimension. + +Enable Context Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To activate CP in the NeMo framework, set the ``context_parallel_size`` parameter in the model configuration. This parameter specifies the number of GPUs among which the model's sequence activations are distributed. + +**For Context Parallelism**: + +Set ``context_parallel_size`` to a value greater than ``1`` to enable sequence-wide model parallelism. + + .. code-block:: yaml + + context_parallel_size: 1 # Example to enable Context Parallelism + +The configuration can be found and modified here: `NeMo Megatron Core Context Config `_. 
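The parallelism sizes described in this section compose multiplicatively: one model replica spans the product of the tensor, pipeline, and context parallel sizes, and the remaining factor of the world size becomes the data-parallel size. The small helper below is hypothetical (not a NeMo API) and simply makes that bookkeeping explicit.

.. code-block:: python

    # Hypothetical helper, not part of NeMo: sanity-check how the parallelism
    # sizes described above combine into the available number of GPUs.
    def data_parallel_size(world_size, tp=1, pp=1, cp=1):
        per_replica = tp * pp * cp          # GPUs spanned by one model replica
        if world_size % per_replica:
            raise ValueError("world size must be divisible by tp * pp * cp")
        return world_size // per_replica

    # e.g. 32 GPUs with tensor_model_parallel_size=4, pipeline_model_parallel_size=2,
    # and context_parallel_size=2 leave 2 data-parallel model replicas.
    assert data_parallel_size(32, tp=4, pp=2, cp=2) == 2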
+ +Implement Context Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +NeMo leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. + +Visit our source code for more insights into the implementation: +- `Megatron Core wrappers for Transformer Engine `_ +- `Transformer Engine attention modules `_ + + Expert Parallelism ^^^^^^^^^^^^^^^^^^ -Expert Paralellim (EP) distributes experts across GPUs. - +Expert Parallelism (EP) is a type of model parallelism that distributes experts of an MoE across GPUs. .. image:: ../nlp/nemo_megatron/images/ep.png :align: center :width: 800px :alt: Expert Parallelism +Enable Expert Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~ + +To enable EP, set ``model.expert_model_parallel_size`` to the desired expert parallel size. For example, if the model has six experts (``model.num_moe_experts=6``), then setting ``model.expert_model_parallel_size=3`` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. + + .. code-block:: yaml + + expert_model_parallel_size: 3 # Set EP to 3 + +For further information on configuration, refer to the following documentation: `NeMo Megatron GPT Config `_. + + +Implement Expert Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The NeMo implementation of Expert Parallelism uses functionality from Megatron Core. Please consult the `Megatron Core MoE layer `_ for more MoE implementation details. + + Parallelism nomenclature ^^^^^^^^^^^^^^^^^^^^^^^^ -When reading and modifying NeMo Megatron code you will encounter the following terms. +The following figure illustrates some terms that you may encounter in the NeMo Megatron codebase. .. image:: ../nlp/nemo_megatron/images/pnom.gif :align: center diff --git a/docs/source/features/throughput_optimizations.rst b/docs/source/features/throughput_optimizations.rst index 825c3add5dfb..dfd8b6cf9310 100644 --- a/docs/source/features/throughput_optimizations.rst +++ b/docs/source/features/throughput_optimizations.rst @@ -71,8 +71,8 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \ model.data.train_ds.file_names=[/path/to/training.jsonl] \ model.data.train_ds.max_seq_length=2048 \ - model.restore_from_path= \ - +output_dir= + +tokenizer_path=/path/to/tokenizer.model \ + +output_dir=/path/to/output_folder \ +pack_sizes=[2048,4096,8192] \ [ +packing_algorithm=first_fit_shuffle \ ] [ +seed=0 ] @@ -86,10 +86,7 @@ target length (i.e. efficient packing), then use shuffle. Otherwise try *first_f to the size of packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value as unpacked data, and can be determined by examining the distribution of sequence lengths in the dataset. - Note 3. Currently, we require a full nemo model file for simplicity and readability of code, but in theory only a - tokenizer file is needed. This part can be improved in a future iteration of the script. - - Note 4. ``pack_sizes`` is a list of packed sequence lengths. 
In this example, there will be three output files, one for + Note 3. ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for each pack size. The output files are named ``/packed_{pack_size}_seed{seed}.npy``. This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size in @@ -135,6 +132,14 @@ To train with packed sequences, you need to change four items in the SFT/PEFT co Now you are all set to finetune your model with a much improved throughput! +Sequence Packing for NeVA +------------------------- + +Sequence packing in NeVA (Multimodal LLMs) differs slightly from the LLM SFT/PEFT approach. For details, +please refer to the documentation below + +:doc:`../multimodal/mllm/sequence_packing` + Communication Overlap --------------------- NeMo leverages Megatron-Core's optimizations to enhance bandwidth utilization and effectively overlap computation with communication. Additional details will be provided soon. diff --git a/docs/source/index.rst b/docs/source/index.rst index 82d3359480ca..511d3ef700c9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,7 +41,6 @@ For quick guides and tutorials, see the "Getting started" section below. :titlesonly: starthere/intro - starthere/best-practices starthere/tutorials For more information, browse the developer docs for your area of interest in the contents section below or on the left sidebar. @@ -70,7 +69,7 @@ For more information, browse the developer docs for your area of interest in the :name: APIs :titlesonly: - core/core_index + apis .. toctree:: :maxdepth: 1 @@ -86,4 +85,4 @@ For more information, browse the developer docs for your area of interest in the :name: Speech AI Tools :titlesonly: - tools/intro \ No newline at end of file + tools/intro diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst index d6f96e6c6ea4..7a9fe2822d07 100644 --- a/docs/source/multimodal/api.rst +++ b/docs/source/multimodal/api.rst @@ -1,5 +1,5 @@ -Multimodal API -======================= +NeMo Multimodal API +=================== Model Classes ------------- @@ -8,6 +8,7 @@ Model Classes :show-inheritance: :no-members: :members: __init__, configure_optimizers + :no-index: .. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion @@ -16,18 +17,18 @@ Model Classes :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. autoclass:: nemo.collections.multimodal.models.dreambooth.dreambooth.MegatronDreamBooth +.. autoclass:: nemo.collections.multimodal.models.text_to_image.dreambooth.dreambooth.MegatronDreamBooth :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.MegatronControlNet +.. autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.MegatronControlNet :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets -.. autoclass:: nemo.collections.multimodal.models.imagen.imagen.MegatronImagen +.. 
autoclass:: nemo.collections.multimodal.models.text_to_image.imagen.imagen.MegatronImagen :show-inheritance: :no-members: :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets @@ -65,7 +66,7 @@ Modules :members: __init__, encode -.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel +.. autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.ControlledUnetModel :show-inheritance: :no-members: :members: forward diff --git a/docs/source/multimodal/mllm/checkpoint.rst b/docs/source/multimodal/mllm/checkpoint.rst index 46c6da631ba2..d1fe7b651e66 100644 --- a/docs/source/multimodal/mllm/checkpoint.rst +++ b/docs/source/multimodal/mllm/checkpoint.rst @@ -41,7 +41,7 @@ Converting Local Checkpoints The training script only auto-converts the final checkpoint into the ``.nemo`` format. To evaluate intermediate training checkpoints, conversion to ``.nemo`` might be needed. For this: -.. code-block:: python +.. code-block:: bash python -m torch.distributed.launch --nproc_per_node= * \ examples/multimodal/convert_ckpt_to_nemo.py \ @@ -59,12 +59,12 @@ NeVA Checkpoints Currently, the conversion mainly supports LLaVA checkpoints based on "llama-2 chat" checkpoints. As a reference, we'll consider the checkpoint `llava-llama-2-13b-chat-lightning-preview `_. -After downloading this checkpoint and saving it at `/path/to/llava-llama-2-13b-chat-lightning-preview`, undertake the following procedures: +After downloading this checkpoint and saving it at ``/path/to/llava-llama-2-13b-chat-lightning-preview``, undertake the following procedures: Modifying the Tokenizer """"""""""""""""""""""" -NeMo mandates adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in `/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer`, execute the following in the NeMo container: +NeMo mandates adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in ``/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer``, execute the following in the NeMo container: .. code-block:: bash @@ -82,7 +82,7 @@ Checkpoint Conversion For conversion: -.. code-block:: python +.. code-block:: bash python examples/multimodal/mllm/neva/convert_hf_llava_to_neva.py \ --in-file /path/to/llava-llama-2-13b-chat-lightning-preview \ @@ -99,7 +99,7 @@ NeVA Checkpoints Adjust model parallelism with: -.. code-block:: python +.. code-block:: bash python examples/nlp/language_modeling/megatron_change_num_partitions.py \ --model_file=/path/to/source.nemo \ diff --git a/docs/source/multimodal/mllm/datasets.rst b/docs/source/multimodal/mllm/datasets.rst index 1c64c4d317d2..2f2000124e4d 100644 --- a/docs/source/multimodal/mllm/datasets.rst +++ b/docs/source/multimodal/mllm/datasets.rst @@ -90,6 +90,14 @@ For NeVA training, integrating special tokens into the tokenizer is vital. After .. code-block:: bash + cd /opt; git clone https://github.com/google/sentencepiece.git && \ + cd sentencepiece && \ + mkdir build && \ + cd build && \ + cmake .. 
&& \ + make && \ + make install && \ + ldconfig cd /opt/sentencepiece/src/; protoc --python_out=/opt/NeMo/scripts/tokenizers/ sentencepiece_model.proto python /opt/NeMo/scripts/tokenizers/add_special_tokens_to_sentencepiece.py \ --input_file /path/to/neva/tokenizers/tokenizer.model \ diff --git a/docs/source/multimodal/mllm/intro.rst b/docs/source/multimodal/mllm/intro.rst index 687ecd930a9e..0e76a9737a0f 100644 --- a/docs/source/multimodal/mllm/intro.rst +++ b/docs/source/multimodal/mllm/intro.rst @@ -10,4 +10,5 @@ The endeavor to extend Language Models (LLMs) into multimodal domains by integra configs checkpoint neva - + video_neva + sequence_packing diff --git a/docs/source/multimodal/mllm/sequence_packing.rst b/docs/source/multimodal/mllm/sequence_packing.rst new file mode 100644 index 000000000000..b061ee1d89c6 --- /dev/null +++ b/docs/source/multimodal/mllm/sequence_packing.rst @@ -0,0 +1,127 @@ +Sequence Packing for NeVA +========================= + +Overview +-------- +As outlined in the throughput optimizations section, most multimodal LLM datasets, such as the LLaVA datasets, exhibit a skewed distribution of sequence lengths. Many sequences are short, and a few are very long, conforming to Zipf’s Law. Transformer models require fixed-length inputs, necessitating padding with many unused pad tokens, which is inefficient for two reasons: + +1. Computation on pad values is disregarded in the final model output, resulting in wasted FLOPs. +2. The micro batch size is often constrained by the batch containing the longest sequences, leading to underutilized GPU memory in most other batches. + +Sequence packing is a training technique wherein multiple training sequences (examples) are concatenated into one long sequence (pack). This approach eliminates the need for padding and allows for more tokens to be processed per micro batch, optimizing both GPU compute and memory utilization. + +For Sequence Packing in SFT / PEFT for LLMs, NeVA considers the following design: + +1. Original Datasets to Sequence Lengths Files + + 1.1. **PyTorch Loaders for Dataset Processing Efficiency** + To efficiently manage large datasets (~700K sequences), the system utilizes PyTorch's DataLoader with multi-worker capabilities, significantly speeding up the data processing phase by parallelizing the loading and pre-processing steps. + 1.2. **Handling Large Datasets** + The system writes sequence lengths to disk on the fly, ensuring scalability and efficient memory usage, as loading all data into memory is impractical. + 1.3. **Efficient I/O Operations** + To facilitate efficient I/O operations necessary for parallelized data loading, the system employs IndexedDataset from Megatron-Core, chosen for its ability to dynamically build binary tensor files. + +2. Packing Sequences into Bins + + 2.1. **Algorithm Choices and Performance** + The first_fit_decreasing and first_fit_shuffle algorithms initially used for packing sequences into bins showed performance issues due to their O(n^2) complexity, making the processing of NeVA samples time-consuming. + 2.2. **Introduction of shuffle_and_pack** + To address these inefficiencies, the shuffle_and_pack algorithm was introduced, an O(n) complexity algorithm that shuffles the sequence lengths before packing them into bins sequentially, significantly improving processing time. + 2.3. 
**Parallelization of Packing Process** + The system implements a parallelized approach to the first_fit_shuffle algorithm by dividing the samples into chunks (~20K samples each) and processing them separately, effectively mitigating the quadratic complexity problem. The bins from each chunk are then combined in the final step, enhancing overall efficiency. + 2.4. **Efficiency Improvements with completed_bins** + A minor optimization involves using completed_bins to prevent the algorithm from iterating over bins that cannot accommodate the minimum sequence length, leading to a more efficient packing process. + +3. Reading Sequence Lengths and Packing into New Files + After determining the optimal bins for packing, the system reads the sequence lengths from the generated files and packs these lengths into new files based on the bins' assignments. This final step consolidates the sequences into efficiently packed bins, ready for further processing or analysis. + +Performance Improvement +----------------------- +A 40% speed increase was achieved with optimized sequence packing for sequence length w/ Vicuna-1.5 13B (LLaVA 1.5 recipe). Detailed performance metrics across different configurations and stages are provided in the tables below. + +Fine-tuning Performance Table: + ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ +| Stage | Vision Encoder | LLM Model | TP | PP | Precision | Sequence Packing | Step Timing (s) | Global Batch Size | Samples / Sec | Perf Improvement | ++==============+===========================+================+====+====+===========+==================+=================+===================+===============+===================+ +| Fine-tuning | openai/clip-vit-large- | Vicuna-1.5 13B | 8 | 1 | BF16 | No | 2.008 | 128 | 63.745 | 0% | +| | patch14-336 | | | | | | | | | | ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ +| Fine-tuning | openai/clip-vit-large- | Vicuna-1.5 13B | 4 | 2 | BF16 | No | 1.889 | 128 | 67.761 | 6% | +| | patch14-336 | | | | | | | | | | ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ +| Fine-tuning | openai/clip-vit-large- | Vicuna-1.5 13B | 8 | 1 | BF16 | Yes | 1.302 | 116.08 | 89.155 | 40% | +| | patch14-336 | | | | | | | | | | ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ +| Fine-tuning | openai/clip-vit-large- | Vicuna-1.5 13B | 4 | 2 | BF16 | Yes | 1.237 | 116.08 | 93.840 | 47% | +| | patch14-336 | | | | | | | | | | ++--------------+---------------------------+----------------+----+----+-----------+------------------+-----------------+-------------------+---------------+-------------------+ + +How to Run NeVA with Packed Sequence +------------------------------------ +Prepare Dataset +^^^^^^^^^^^^^^^ +We provide an easy-to-use script for preprocessing a dataset for the NeMo Multimodal Learning framework. It requires specifying paths for data, images, and the tokenizer model, among other parameters. + +.. 
code-block:: bash + + python examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py \ + --data_path=/path/to/LLaVA-Instruct-150K/llava_v1_5_mix665k_filtered.json \ + --image_folder=/path/to/LLaVA-Instruct-150K/images \ + --tokenizer_path=/path/to/checkpoints/tokenizer_add_special.model \ + --output_dir=/path/to/LLaVA-Instruct-150K/packed_seq_12288_336_v1 \ + --max_seq_length=12288 \ + --packing_algorithm=first_fit_shuffle \ + --hf_vision_encoder=openai/clip-vit-large-patch14-336 \ + --conv_template=v1 \ + --image_aspect_ratio=pad \ + --seed=42 + +Parameters: +* ``--data_path``: Path to the dataset file in JSON format. +* ``--image_folder``: Directory containing the images referenced in the dataset. +* ``--tokenizer_path``: Path to the tokenizer model. +* ``--output_dir``: Directory where the processed dataset will be stored. +* ``--max_seq_length``: The maximum sequence length of the model. +* ``--packing_algorithm``: Algorithm used for packing sequences. Defaults to 'first_fit_shuffle'. +* ``--hf_vision_encoder``: The Hugging Face vision encoder to use. Default is 'openai/clip-vit-large-patch14-336'. +* ``--conv_template``: Template for data conversion. Default is 'plain', with 'v1' as an alternative. +* ``--image_aspect_ratio``: The aspect ratio for processing images. Defaults to 'square', 'pad' for padding to maintain aspect ratio. +* ``--seed``: Seed for random operations in 'first_fit_shuffle'. +* ``--hparams_file``: Optional path to a YAML file containing additional hyperparameters. + +Remarks: +1. The current version of data processing saves processed image tensors in the sequence packing, which may require significant storage. This issue will be addressed in future iterations. +2. The ``max_seq_length`` is crucial for achieving optimal performance. Excessive length can lead to out-of-memory errors, while insufficient length may degrade performance. +3. The conversation prompt template is inserted during this step to ensure accurate sequence length calculation. + +Adjust Training Config +"""""""""""""""""""""" +To train with packed sequences, modify four items in the SFT/PEFT config file. + +1. Enable the ``packed_sequence`` flag: + +.. code-block:: bash + + ++model.data.data_prefix=/lustre/fsw/coreai_dlalgo_genai/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset + ++model.data.crop_size=[224,224] + ++model.data.packed_sequence=True + +2. Use the new dataset file instead of the original JSONL file and ensure the crop sizes are correctly specified since images are now cached: + +.. code-block:: bash + + ++model.data.data_prefix=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset + ++model.data.crop_size=[336,336] + +4. Adjust batch sizes: + +* Micro batch size should be set to 1 due to concatenation in the preprocessing step. Increase ``pack_size`` to achieve a higher micro batch size. +* Global batch size should be adjusted based on the average number of sequences per pack (``n``), calculated as the total number of sequences divided by the number of packs. This maintains the training recipe by ensuring each gradient iteration sees, on average, the same number of tokens. + +.. code-block:: bash + + model.micro_batch_size=1 + model.global_batch_size= + +Now, you are ready to fine-tune your model with significantly improved throughput! 
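As a worked example of the batch size adjustment above (with made-up numbers), the sketch below shows how the packed global batch size follows from the average number of sequences per pack:

.. code-block:: python

    # Hypothetical numbers for illustration: keep the number of sequences seen
    # per optimizer step roughly constant after switching to packed sequences.
    num_sequences = 665_000        # sequences in the unpacked dataset
    num_packs = 95_000             # packs written by the preprocessing script
    avg_seqs_per_pack = num_sequences / num_packs     # "n" in the text, here 7.0

    unpacked_global_batch_size = 128
    micro_batch_size = 1           # one pack per micro batch after packing
    packed_global_batch_size = max(
        micro_batch_size,
        round(unpacked_global_batch_size / avg_seqs_per_pack),
    )
    print(packed_global_batch_size)  # 18 packs per step, roughly 126 sequences per step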
diff --git a/docs/source/multimodal/mllm/video_neva.rst b/docs/source/multimodal/mllm/video_neva.rst new file mode 100644 index 000000000000..eb0624545a3e --- /dev/null +++ b/docs/source/multimodal/mllm/video_neva.rst @@ -0,0 +1,204 @@ +Video NeVA +========== + +Model Introduction +------------------ + +Video NeVa adds support for video modality in NeVa by representing video as multiple image frames. + +There is only a minor change done to :class:`~nemo.collections.multimodal.models.multimodal_llm.neva.neva_model.MegatronNevaModel` class in order to support pretraining on video input data. + +Representing video input as a series of images is done in :class:`~nemo.collections.multimodal.data.neva.TarOrFolderVideoLoader` class, using Decord which provides convenient video slicing methods. + + +Video Neva Configuration +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + data: + media_type: video + splice_single_frame: null + num_frames: 8 + image_token_len: 256 + image_folder: null + video_folder: null + +- ``media_type``: If set to `video`, NeVa's dataloader goes through the additional preprocessing steps to represent the input video data as a series of image frames. +- ``splice_single_frame``: Can either be set as `first`, `middle` or `last`. This will result in only a single frame in that specific location of the video being selected. +- ``image_token_len``: The NeVa dataloader calculates `image_token_len` based on the height and width of the preprocessed image frame and the patch size of the CLIP model being used. + +.. code-block:: python + + image_token_len = (224 // 14) * (224 // 14) = 16 * 16 = 256 + +- ``num_frames``: This is used to select the number of image frames that will be used to represent the video. +- ``video_folder``: This specifies the directory where the video files are located. This follows the same format as NeVa's `image_folder`. + + + +Inference with Video NeVA +========================= + +We can run ``neva_evaluation.py`` located in ``NeMo/examples/multimodal/multimodal_llm/neva`` to generate inference results from the Video NeVA model. +Currently, video NeVA supports both image and video inference by changing the config attribute ``inference.media_type`` in ``NeMo/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml`` to either ``image`` or ``video``, and adding the corresponding media path ``inference.media_base_path``. 
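Because a video is expanded into image frames, it can help to estimate the media token budget before running inference, for example when choosing ``num_frames`` or ``inference.tokens_to_generate``. The sketch below assumes a CLIP ViT-L/14 encoder at 224x224 resolution and that every frame contributes ``image_token_len`` tokens, matching the configuration example earlier on this page.

.. code-block:: python

    # Back-of-the-envelope sketch (assumptions: CLIP ViT-L/14 at 224x224, and
    # each frame contributes image_token_len tokens to the prompt).
    crop_size, patch_size = 224, 14
    image_token_len = (crop_size // patch_size) ** 2   # 16 * 16 = 256 tokens per frame
    num_frames = 8                                     # from the data config above
    video_tokens = num_frames * image_token_len        # 2048 media tokens per video
    print(image_token_len, video_tokens)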
+ +Inference with Pretrained Projectors with Base LM Model +------------------------------------------------------- + +An example of an inference script execution: + +For running video inference:: + + CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 /path/to/neva_evaluation.py \ + --config-path=/path/to/conf/ \ + --config-name=neva_inference.yaml \ + tensor_model_parallel_size=4 \ + pipeline_model_parallel_size=1 \ + neva_model_file=/path/to/projector/checkpoint \ + base_model_file=/path/to/base/lm/checkpoint \ + trainer.devices=4 \ + trainer.precision=bf16 \ + prompt_file=/path/to/prompt/file \ + inference.media_base_path=/path/to/videos \ + inference.media_type=video \ + output_file=/path/for/output/file/ \ + inference.temperature=0.2 \ + inference.top_k=0 \ + inference.top_p=0.9 \ + inference.greedy=False \ + inference.add_BOS=False \ + inference.all_probs=False \ + inference.repetition_penalty=1.2 \ + inference.insert_media_token=right \ + inference.tokens_to_generate=256 \ + quantization.algorithm=awq \ + quantization.enable=False + +Example format of ``.jsonl`` prompt_file:: + + {"video": "video_test.mp4", "text": "Can you describe the scene?", "category": "conv", "question_id": 0} + +input video file: video_test.mp4 + +Output:: + + System + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. + + User + Can you describe the scene?