diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 592080b07..9d92c4b9d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -13,82 +13,14 @@ on: workflow_dispatch: jobs: - pre-commit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - run: pip install pre-commit - - run: pre-commit --version - - run: pre-commit install - - run: pre-commit run --all-files - pretest: - runs-on: ubuntu-latest - strategy: - matrix: - toxenv: [pylint, doc8, docs] - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Run pre-tests with Tox - run: tox -e ${{ matrix.toxenv }} - - test-dashboard-build: - needs: pretest - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Use Node.js ${{ matrix.node-version }} - uses: actions/setup-node@v2 - with: - node-version: 16 - - name: Compile Dashboard - run: | - cd dashboard/src - yarn - yarn build - rm -rf ../build - mv build .. - cd ../../ - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e .[test] - - name: Get gecko driver - run: | - wget https://github.com/mozilla/geckodriver/releases/download/v0.31.0/geckodriver-v0.31.0-linux64.tar.gz - tar -xvf geckodriver-v0.31.0-linux64.tar.gz - - name: Create gunicorn_tmp_dir to match orion_config.yaml - run: mkdir -p gunicorn_tmp_dir - - name: Launch backend - run: orion serve -c .github/workflows/orion/orion_config.yaml & - - name: Launch frontend - run: orion frontend & - - name: Install Firefox - uses: browser-actions/setup-firefox@latest - - name: Test frontend - run: PATH=$PATH:$(pwd) pytest tests/functional/serving/test_frontend.py - - test: - needs: [pre-commit, pretest] + test: runs-on: ${{ matrix.platform }} strategy: max-parallel: 4 matrix: platform: [ubuntu-latest] - python-version: [3.7, 3.8, 3.9, '3.10'] + python-version: [3.7] env: PLATFORM: ${{ matrix.platform }} steps: @@ -102,7 +34,7 @@ jobs: python -m pip install --upgrade pip pip install tox tox-gh-actions - name: Test with tox (and all extra dependencies) - run: tox -e py-all + run: tox -e py-all -- tests/functional/example/test_speechbrain_tutorial.py - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 with: @@ -111,243 +43,3 @@ jobs: env_vars: PLATFORM,PYTHON name: codecov-umbrella fail_ci_if_error: false - - test-long-algos: - needs: [pre-commit, pretest] - runs-on: ${{ matrix.platform }} - strategy: - matrix: - platform: [ubuntu-latest] - python-version: [3.7, 3.8, 3.9] - algo: [nevergrad, hebo, ax] - env: - PLATFORM: ${{ matrix.platform }} - steps: - - uses: actions/checkout@v1 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - - name: Test long algorithms with tox - run: tox -e algo -- tests/unittests/algo/long/${{ matrix.algo }} - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml - flags: unittests - env_vars: PLATFORM,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - - test_no_extras: - needs: [pre-commit, pretest] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test with tox - run: tox -e py - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml - flags: unittests - env_vars: PLATFORM,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - mongodb: - needs: [pre-commit, pretest] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Setup MongoDB - uses: supercharge/mongodb-github-action@1.8.0 - with: - mongodb-version: 6.0 - - - name: Install MongoShell - run: | - wget -qO - https://www.mongodb.org/static/pgp/server-6.0.asc | sudo apt-key add - - echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/6.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-6.0.list - sudo apt-get update - sudo apt-get install -y mongodb-org - - - name: Configure MongoDB - run: | - mongosh orion_test --eval 'db.createUser({user:"user",pwd:"pass",roles:["readWrite"]});' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test with tox - run: tox -e mongodb - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml - flags: backward - env_vars: PLATFORM,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - backward-compatibility: - needs: [pre-commit, pretest] - runs-on: ubuntu-latest - strategy: - max-parallel: 2 - matrix: - orion_db_type: [mongodb, pickleddb] - env: - ORION_DB_TYPE: ${{ matrix.orion_db_type }} - steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Setup MongoDB - uses: supercharge/mongodb-github-action@1.8.0 - with: - mongodb-version: 6.0 - - - name: Install MongoShell - run: | - wget -qO - https://www.mongodb.org/static/pgp/server-6.0.asc | sudo apt-key add - - echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/6.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-6.0.list - sudo apt-get update - sudo apt-get install -y mongodb-org - - - name: Configure MongoDB - run: | - mongosh orion_test --eval 'db.createUser({user:"user",pwd:"pass",roles:["readWrite"]});' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test with tox - run: tox -e backward-compatibility - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml - flags: backward - env_vars: PLATFORM,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - pypi: - needs: [test, backward-compatibility, test-long-algos, mongodb, test_no_extras, test-dashboard-build] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test packaging - run: tox -e packaging - - name: Build - run: tox -e build - - name: Test dashboard build deployment on normal install - run: | - # Get package path - export ORION_PACKAGE=$( realpath `find dist/ -type f` ) - echo Package path: ${ORION_PACKAGE} - # Move to another folder (to prevent any confusion at installation with repo folder) - cd ~ - echo Normal install - pip install ${ORION_PACKAGE} - # Get prefix - export ORION_PREFIX=$( python -c "import sys; print(sys.prefix);" ) - echo Check if dashboard build is installed - if ( [ -d "${ORION_PREFIX}/orion-dashboard" ] ); then true; else false; fi - if ( [ -f "${ORION_PREFIX}/orion-dashboard/build/index.html" ] ); then true; else false; fi - if ( ls ${ORION_PREFIX}/orion-dashboard/build/static/js/main.*.js ); then true; else false; fi - echo Check if frontend script can find dashboard build - python -c "from orion.core.cli.frontend import get_dashboard_build_path; get_dashboard_build_path();" - echo Clean-up - pip uninstall -y orion - echo Check if dashboard build is correctly removed - # NB: It seems orion-dashboard build is not deleted, - # but it should be empty after uninstall - if ( [ -f "${ORION_PREFIX}/orion-dashboard/build/index.html" ] ); then false; fi - if ( ls ${ORION_PREFIX}/orion-dashboard/build/static/js/main.*.js ); then false; fi - echo End - cd - - - name: Test dashboard build deployment on user install - run: | - # Get package path - export ORION_PACKAGE=$( realpath `find dist/ -type f` ) - echo Package path: ${ORION_PACKAGE} - # Move to another folder (to prevent any confusion at installation with repo folder) - cd ~ - echo User install - pip install --user ${ORION_PACKAGE} - # Get prefix - export ORION_PREFIX=$( python -c "import site; print(site.USER_BASE);" ) - echo Check if dashboard build is installed - if ( [ -d "${ORION_PREFIX}/orion-dashboard" ] ); then true; else false; fi - if ( [ -f "${ORION_PREFIX}/orion-dashboard/build/index.html" ] ); then true; else false; fi - if ( ls ${ORION_PREFIX}/orion-dashboard/build/static/js/main.*.js ); then true; else false; fi - echo Check if frontend script can find dashboard build - python -c "from orion.core.cli.frontend import get_dashboard_build_path; get_dashboard_build_path();" - echo Clean-up - pip uninstall -y orion - echo Check if dashboard build is correctly removed - # NB: In user install case, it seems folder orion-dashboard itself is not deleted, - # but it should be empty after uninstall - if ( [ -f "${ORION_PREFIX}/orion-dashboard/build/index.html" ] ); then false; fi - if ( ls ${ORION_PREFIX}/orion-dashboard/build/static/js/main.*.js ); then false; fi - echo End - cd - - - name: Publish distribution đŸ“¦ to Test PyPI - if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master - with: - user: __token__ - password: ${{ secrets.test_pypi_password }} - repository_url: https://test.pypi.org/legacy/ - - name: Publish distribution đŸ“¦ to PyPI - if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master - with: - user: __token__ - password: ${{ secrets.pypi_password }} - conda: - needs: [test, backward-compatibility, test-long-algos, test-dashboard-build] - runs-on: ubuntu-latest - env: - ANACONDA_TOKEN: ${{ secrets.anaconda_token }} - steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Build conda - run: ./conda/ci_build.sh - - name: Publish distribution đŸ“¦ to Conda - if: startsWith(github.ref, 'refs/tags') - run: ./conda/upload.sh diff --git a/.github/workflows/dashboard-build.yml b/.github/workflows/dashboard-build.yml index 3a51d41ec..9e8413eab 100644 --- a/.github/workflows/dashboard-build.yml +++ b/.github/workflows/dashboard-build.yml @@ -1,57 +1,58 @@ -name: Dashboard Build PR - -on: - push: - branches: [ develop ] - -defaults: - run: - working-directory: dashboard/src - -jobs: - build_dashboard: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v2 - - name: Use Node.js ${{ matrix.node-version }} - uses: actions/setup-node@v2 - with: - node-version: 16 - - name: Compile Dashboard - run: | - # NB: We are in directory dashboard/src - yarn - yarn build - # If previous build folder exists and is same as new one, do not update. - # In any other case, we must update. - if ( [ -d "../build" ] && diff -qr build ../build ); then UPDATE_BUILD=0; else UPDATE_BUILD=1; fi - echo "update_build=${UPDATE_BUILD}" >> $GITHUB_ENV - echo Build changes ? ${UPDATE_BUILD} - - name: Update compiled Dashboard - if: ${{ env.update_build == '1' }} - run: | - rm -rf ../build - mv build .. - - name: Create Pull Request - if: ${{ env.update_build == '1' }} - id: cpr - uses: peter-evans/create-pull-request@v3 - with: - commit-message: 'Compile Dashboard' - branch: ci/build-dashboard - base: develop - delete-branch: true - title: 'Compile Dashboard' - body: | - Auto-generated by [create-pull-request][1] - - [1]: https://github.com/peter-evans/create-pull-request - labels: | - automated pr - - name: Check outputs - if: ${{ env.update_build == '1' }} - run: | - echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" - echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" +#name: Dashboard Build PR +# +#on: +# push: +# branches: [ develop ] +# +#defaults: +# run: +# working-directory: dashboard/src +# +#jobs: +# build_dashboard: +# +# runs-on: ubuntu-latest +# +# steps: +# - uses: actions/checkout@v2 +# - name: Use Node.js ${{ matrix.node-version }} +# uses: actions/setup-node@v2 +# with: +# node-version: 16 +# - name: Compile Dashboard +# run: | +# # NB: We are in directory dashboard/src +# yarn +# yarn build +# # If previous build folder exists and is same as new one, do not update. +# # In any other case, we must update. +# if ( [ -d "../build" ] && diff -qr build ../build ); then UPDATE_BUILD=0; else UPDATE_BUILD=1; fi +# echo "update_build=${UPDATE_BUILD}" >> $GITHUB_ENV +# echo Build changes ? ${UPDATE_BUILD} +# - name: Update compiled Dashboard +# if: ${{ env.update_build == '1' }} +# run: | +# rm -rf ../build +# mv build .. +# - name: Create Pull Request +# if: ${{ env.update_build == '1' }} +# id: cpr +# uses: peter-evans/create-pull-request@v3 +# with: +# commit-message: 'Compile Dashboard' +# branch: ci/build-dashboard +# base: develop +# delete-branch: true +# title: 'Compile Dashboard' +# body: | +# Auto-generated by [create-pull-request][1] +# +# [1]: https://github.com/peter-evans/create-pull-request +# labels: | +# automated pr +# - name: Check outputs +# if: ${{ env.update_build == '1' }} +# run: | +# echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" +# echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" +# \ No newline at end of file diff --git a/.github/workflows/dashboard-src.yml b/.github/workflows/dashboard-src.yml index 97853021c..31dedaa3d 100644 --- a/.github/workflows/dashboard-src.yml +++ b/.github/workflows/dashboard-src.yml @@ -1,81 +1,82 @@ # Workflow to run unit tests from Github Actions. # Inspired from: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions -name: dashboard-src - -on: - push: - branches: [ master, develop ] - pull_request: - branches: [ master, develop ] - schedule: - - cron: '44 4 * * *' - release: - types: [published] - workflow_dispatch: - -defaults: - run: - working-directory: dashboard/src - -jobs: - test-dashboard-src: - - runs-on: ubuntu-latest - - strategy: - matrix: - node-version: [12.x, 14.x, 16.x] - - steps: - - uses: actions/checkout@v2 - - name: Use Node.js ${{ matrix.node-version }} - uses: actions/setup-node@v2 - with: - node-version: ${{ matrix.node-version }} - # Launch an orion server - - name: Launch Orion server - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - run: python -m pip install git+https://github.com/notoraptor/orion.git@feature/benchmark_webapi_rebased#egg=orion[profet] - - name: info about current directory - run: df . - - name: check filesystem type using df - run: df -Th - - name: check filesystem type using fstab - run: cat /etc/fstab - - - name: Setup MongoDB - uses: supercharge/mongodb-github-action@1.8.0 - with: - mongodb-version: 4.2 - mongodb-db: orion_dashboard_test - - name: Populate MongoDB - run: | - cd ../../ - python .github/workflows/orion/pickle_to_mongodb.py - cd dashboard/src/ - - - name: Start Orion backend - run: | - # Start Orion backend in repository root folder. - cd ../../ - mkdir -p gunicorn_tmp_dir - orion -vv serve -c .github/workflows/orion/orion_config_mongodb.yaml 2> orion-backend-${{ matrix.node-version }}.log & - cd dashboard/src/ - # install - - run: yarn - # check files formatting using Carbon's `ci-check` script - - run: yarn ci-check - # Run tests - # NB: Tests are running in parallel by default, this may cause backend to receive too many requests in small time - # Option --runInBand allows running tests sequentially: https://jestjs.io/docs/cli#--runinband - - run: yarn test --all --verbose --runInBand - # Upload orion backend log. - - name: Get Orion backend log - if: always() - uses: actions/upload-artifact@v3 - with: - name: orion-backend-log-artifact-${{ matrix.node-version }} - path: orion-backend-${{ matrix.node-version }}.log +#name: dashboard-src +# +#on: +# push: +# branches: [ master, develop ] +# pull_request: +# branches: [ master, develop ] +# schedule: +# - cron: '44 4 * * *' +# release: +# types: [published] +# workflow_dispatch: +# +#defaults: +# run: +# working-directory: dashboard/src +# +#jobs: +# test-dashboard-src: +# +# runs-on: ubuntu-latest +# +# strategy: +# matrix: +# node-version: [12.x, 14.x, 16.x] +# +# steps: +# - uses: actions/checkout@v2 +# - name: Use Node.js ${{ matrix.node-version }} +# uses: actions/setup-node@v2 +# with: +# node-version: ${{ matrix.node-version }} +# # Launch an orion server +# - name: Launch Orion server +# uses: actions/setup-python@v2 +# with: +# python-version: 3.8 +# - run: python -m pip install git+https://github.com/notoraptor/orion.git@feature/benchmark_webapi_rebased#egg=orion[profet] +# - name: info about current directory +# run: df . +# - name: check filesystem type using df +# run: df -Th +# - name: check filesystem type using fstab +# run: cat /etc/fstab +# +# - name: Setup MongoDB +# uses: supercharge/mongodb-github-action@1.8.0 +# with: +# mongodb-version: 4.2 +# mongodb-db: orion_dashboard_test +# - name: Populate MongoDB +# run: | +# cd ../../ +# python .github/workflows/orion/pickle_to_mongodb.py +# cd dashboard/src/ +# +# - name: Start Orion backend +# run: | +# # Start Orion backend in repository root folder. +# cd ../../ +# mkdir -p gunicorn_tmp_dir +# orion -vv serve -c .github/workflows/orion/orion_config_mongodb.yaml 2> orion-backend-${{ matrix.node-version }}.log & +# cd dashboard/src/ +# # install +# - run: yarn +# # check files formatting using Carbon's `ci-check` script +# - run: yarn ci-check +# # Run tests +# # NB: Tests are running in parallel by default, this may cause backend to receive too many requests in small time +# # Option --runInBand allows running tests sequentially: https://jestjs.io/docs/cli#--runinband +# - run: yarn test --all --verbose --runInBand +# # Upload orion backend log. +# - name: Get Orion backend log +# if: always() +# uses: actions/upload-artifact@v3 +# with: +# name: orion-backend-log-artifact-${{ matrix.node-version }} +# path: orion-backend-${{ matrix.node-version }}.log +# \ No newline at end of file diff --git a/docs/src/index.rst b/docs/src/index.rst index f126046e8..b053cd4a3 100644 --- a/docs/src/index.rst +++ b/docs/src/index.rst @@ -37,6 +37,7 @@ auto_tutorials/code_4_parallelism tutorials/cluster tutorials/pytorch_a2c_ppo + tutorials/speechbrain_tutorial .. toctree:: :caption: Plugins diff --git a/docs/src/tutorials/speech-brain.rst b/docs/src/tutorials/speech-brain.rst new file mode 100644 index 000000000..fba3d3962 --- /dev/null +++ b/docs/src/tutorials/speech-brain.rst @@ -0,0 +1,94 @@ +******************** +SpeechBrain +******************** + +In this short tutorial, we're going to demonstrate how OrĂ­on can be integrated to a `SpeechBrain +`_ speech recognition model. +The files mentioned in this tutorial are available in the `OrĂ­on +`_ repository. + +Installation and setup +====================== + +Make sure OrĂ­on is installed (:doc:`/install/core`). + +Then install SpeechBrain using ``$ pip install speechbrain`` + +Adapting the Speechbrain for OrĂ­on +================================== + +The Adaptation for using OrĂ­on is quite simple. + +1) We first need to import orion.report_objective() into the project. + +.. code-block:: python + + from orion.client import report_objective + +2) We then need to change the evaluation from the training data to the validation data. +The evaluation method should look like this. It returns the validation loss. + +.. literalinclude:: /../../examples/speechbrain_tutorial/main.py + :language: python + :lines: 75-80 + +3) Finally, we call ``report_objective`` at the end to return the final objective value, +the validation loss, to OrĂ­on. + +.. code-block:: python + + report_objective(valid_stats) + +The code is now adapted and ready to be used with OrĂ­on. + +Execution +========= + +We are now going to call the orion hunt function. Notice that we still need to give the train.yaml +file to speechbrain, since the general configuration is in there. However, we are going to specify +the hyper-parameters that we want to optimize after that, +which will automatically overrides the ones set in the train.yaml. + +.. code-block:: bash + + orion hunt \ + --enable-evc -n \ + python main.py train.yaml \ + --lr~'loguniform(0.05, 0.2)' \ + --ctc_weight~'loguniform(0.25, 0.75)' \ + --label_smoothing~'loguniform(1e-10, 10e-5)' \ + --coverage_penalty~'loguniform(1.0, 2.0)' \ + --temperature~'loguniform(1.0, 1.5)' \ + --temperature_lm~'loguniform(1.0, 1.5)' + +Results +======= + +When an experiment reaches its termination criterion, basically ``max-trials``, +you can see the results using the command + +.. code-block:: bash + + $ orion info -n + +This outputs the following statistics + +.. code-block:: bash + + Stats + ===== + completed: True + trials completed: 209 + best trial: + id: 8675cfcfba768243e1ed1ac7825c69b6 + evaluation: 0.13801406680803444 + params: + /coverage_penalty: 1.396 + /ctc_weight: 0.389 + /label_smoothing: 2.044e-10 + /lr: 0.06462 + /temperature: 1.175 + /temperature_lm: 1.087 + start time: 2022-09-29 14:37:41.048314 + finish time: 2022-09-30 20:08:07.384765 + duration: 1 day, 5:30:26.336451 diff --git a/examples/speechbrain_tutorial/download_data.py b/examples/speechbrain_tutorial/download_data.py new file mode 100644 index 000000000..2e08466bf --- /dev/null +++ b/examples/speechbrain_tutorial/download_data.py @@ -0,0 +1,43 @@ +import logging +import sys + +import speechbrain as sb +from hyperpyyaml import load_hyperpyyaml +from mini_librispeech_prepare import prepare_mini_librispeech + +logger = logging.getLogger(__name__) + + +def download(hparams_file, run_opts, overrides): + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Data preparation, to be run on only one process. + sb.utils.distributed.run_on_main( + prepare_mini_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_annotation"], + "save_json_valid": hparams["valid_annotation"], + "save_json_test": hparams["test_annotation"], + }, + ) + + return hparams + + +if __name__ == "__main__": + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + download(hparams_file, run_opts, overrides) diff --git a/examples/speechbrain_tutorial/main.py b/examples/speechbrain_tutorial/main.py new file mode 100644 index 000000000..9baed3a6d --- /dev/null +++ b/examples/speechbrain_tutorial/main.py @@ -0,0 +1,60 @@ +import logging +import sys + +import speechbrain as sb +import torch +from download_data import download +from speechbrain.utils.distributed import run_on_main +from train import ASR, dataio_prepare + +from orion.client import report_objective + +logger = logging.getLogger(__name__) + +if __name__ == "__main__": + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + print("Starting download") + hparams = download(hparams_file, run_opts, overrides) + print("finish download") + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # In this case, pre-training is essential because mini-librispeech is not + # big enough to train an end-to-end model from scratch. With bigger dataset + # you can train from scratch and avoid this step. + # We download the pretrained LM from HuggingFace (or elsewhere depending on + # the path given in the YAML file). The tokenizer is loaded at the same time. + run_on_main(hparams["pretrainer"].collect_files) + hparams["pretrainer"].load_collected(device=torch.device("cpu")) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + # print("Starting fit") + + asr_brain.fit( + asr_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + print("Starting evaluate") + # Load best checkpoint for evaluation + valid_stats = asr_brain.evaluate( + test_set=datasets["valid"], + min_key="WER", + test_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + report_objective(valid_stats) diff --git a/examples/speechbrain_tutorial/mini_librispeech_prepare.py b/examples/speechbrain_tutorial/mini_librispeech_prepare.py new file mode 100644 index 000000000..dadaba7e4 --- /dev/null +++ b/examples/speechbrain_tutorial/mini_librispeech_prepare.py @@ -0,0 +1,188 @@ +""" +Code from the SpeechBrain Github repository : +https://github.com/speechbrain/speechbrain/blob/develop/templates/speech_recognition/mini_librispeech_prepare.py +Downloads and creates manifest files for speech recognition with Mini LibriSpeech. + +Authors: + * Peter Plantinga, 2021 + * Mirco Ravanelli, 2021 +""" + +import json +import logging +import os +import shutil + +from speechbrain.dataio.dataio import read_audio +from speechbrain.utils.data_utils import download_file, get_all_files + +logger = logging.getLogger(__name__) +MINILIBRI_TRAIN_URL = "http://www.openslr.org/resources/31/train-clean-5.tar.gz" +MINILIBRI_VALID_URL = "http://www.openslr.org/resources/31/dev-clean-2.tar.gz" +MINILIBRI_TEST_URL = "https://www.openslr.org/resources/12/test-clean.tar.gz" +SAMPLERATE = 16000 + + +def prepare_mini_librispeech( + data_folder, save_json_train, save_json_valid, save_json_test +): + """ + Prepares the json files for the Mini Librispeech dataset. + + Downloads the dataset if its not found in the `data_folder`. + + Arguments + --------- + data_folder : str + Path to the folder where the Mini Librispeech dataset is stored. + save_json_train : str + Path where the train data specification file will be saved. + save_json_valid : str + Path where the validation data specification file will be saved. + save_json_test : str + Path where the test data specification file will be saved. + + Example + ------- + >>> data_folder = '/path/to/mini_librispeech' + >>> prepare_mini_librispeech(data_folder, 'train.json', 'valid.json', 'test.json') + """ + + # Check if this phase is already done (if so, skip it) + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed in previous run, skipping.") + return + + # If the dataset doesn't exist yet, download it + train_folder = os.path.join(data_folder, "LibriSpeech", "train-clean-5") + valid_folder = os.path.join(data_folder, "LibriSpeech", "dev-clean-2") + test_folder = os.path.join(data_folder, "LibriSpeech", "test-clean") + if not check_folders(train_folder, valid_folder, test_folder): + download_mini_librispeech(data_folder) + + # List files and create manifest from list + logger.info(f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}") + extension = [".flac"] + + # List of flac audio files + wav_list_train = get_all_files(train_folder, match_and=extension) + wav_list_valid = get_all_files(valid_folder, match_and=extension) + wav_list_test = get_all_files(test_folder, match_and=extension) + + # List of transcription file + extension = [".trans.txt"] + trans_list = get_all_files(data_folder, match_and=extension) + trans_dict = get_transcription(trans_list) + + # Create the json files + create_json(wav_list_train, trans_dict, save_json_train) + create_json(wav_list_valid, trans_dict, save_json_valid) + create_json(wav_list_test, trans_dict, save_json_test) + + +def get_transcription(trans_list): + """ + Returns a dictionary with the transcription of each sentence in the dataset. + + Arguments + --------- + trans_list : list of str + The list of transcription files. + """ + # Processing all the transcription files in the list + trans_dict = {} + for trans_file in trans_list: + # Reading the text file + with open(trans_file) as f: + for line in f: + uttid = line.split(" ")[0] + text = line.rstrip().split(" ")[1:] + text = " ".join(text) + trans_dict[uttid] = text + + logger.info("Transcription files read!") + return trans_dict + + +def create_json(wav_list, trans_dict, json_file): + """ + Creates the json file given a list of wav files and their transcriptions. + + Arguments + --------- + wav_list : list of str + The list of wav files. + trans_dict : dict + Dictionary of sentence ids and word transcriptions. + json_file : str + The path of the output json file + """ + # Processing all the wav files in the list + json_dict = {} + for wav_file in wav_list: + + # Reading the signal (to retrieve duration in seconds) + signal = read_audio(wav_file) + duration = signal.shape[0] / SAMPLERATE + + # Manipulate path to get relative path and uttid + path_parts = wav_file.split(os.path.sep) + uttid, _ = os.path.splitext(path_parts[-1]) + relative_path = os.path.join("{data_root}", *path_parts[-5:]) + + # Create entry for this utterance + json_dict[uttid] = { + "wav": relative_path, + "length": duration, + "words": trans_dict[uttid], + } + + # Writing the dictionary to the json file + with open(json_file, mode="w") as json_f: + json.dump(json_dict, json_f, indent=2) + + logger.info(f"{json_file} successfully created!") + + +def skip(*filenames): + """ + Detects if the data preparation has been already done. + If the preparation has been done, we can skip it. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + for filename in filenames: + if not os.path.isfile(filename): + return False + return True + + +def check_folders(*folders): + """Returns False if any passed folder does not exist.""" + for folder in folders: + if not os.path.exists(folder): + return False + return True + + +def download_mini_librispeech(destination): + """Download dataset and unpack it. + + Arguments + --------- + destination : str + Place to put dataset. + """ + train_archive = os.path.join(destination, "train-clean-5.tar.gz") + valid_archive = os.path.join(destination, "dev-clean-2.tar.gz") + test_archive = os.path.join(destination, "test-clean.tar.gz") + download_file(MINILIBRI_TRAIN_URL, train_archive) + download_file(MINILIBRI_VALID_URL, valid_archive) + download_file(MINILIBRI_TEST_URL, test_archive) + shutil.unpack_archive(train_archive, destination) + shutil.unpack_archive(valid_archive, destination) + shutil.unpack_archive(test_archive, destination) diff --git a/examples/speechbrain_tutorial/train.py b/examples/speechbrain_tutorial/train.py new file mode 100644 index 000000000..010df449d --- /dev/null +++ b/examples/speechbrain_tutorial/train.py @@ -0,0 +1,459 @@ +#!/usr/bin/env/python3 +"""Recipe for training a sequence-to-sequence ASR system with mini-librispeech. +The system employs an encoder, a decoder, and an attention mechanism +between them. Decoding is performed with beam search coupled with a neural +language model. + +To run this recipe, do the following: +> python train.py train.yaml + +With the default hyperparameters, the system employs an LSTM encoder. +The decoder is based on a standard GRU. Beam search coupled with an RNN language +model is used on the top of decoder probabilities. + +The neural network is trained on both CTC and negative-log likelihood +targets and sub-word units estimated with Byte Pairwise Encoding (BPE) +are used as basic recognition tokens. Training is performed on the mini-librispeech +dataset. Note that this is a tiny dataset used here just to +provide a working example. To achieve a better performance you have to train with +larger datasets, such as the full LibriSpeech one. In this case, to allow the +model to converge, we pre-train it with a bigger one (trained on the full librispeech +with the seq2seq 1k BPE recipe). + +The experiment file is flexible enough to support a large variety of +different systems. By properly changing the parameter files, you can try +different encoders, decoders, tokens (e.g, characters instead of BPE). + +This recipe assumes that the tokenizer and the LM are already trained. +To avoid token mismatches, the tokenizer used for the acoustic model is +the same use for the LM. The recipe downloads the pre-trained tokenizer +and LM. + +If you would like to train a full system from scratch do the following: +1- Train a tokenizer (see ../Tokenizer) +2- Train a language model (see ../LM) +3- Train the speech recognizer (with this code). + + +Authors + * Mirco Ravanelli 2020 + * Ju-Chieh Chou 2020 + * Abdel Heba 2020 + * Peter Plantinga 2020 + * Samuele Cornell 2020 +""" + +import logging +import sys + +import speechbrain as sb +import torch +from hyperpyyaml import load_hyperpyyaml +from mini_librispeech_prepare import prepare_mini_librispeech +from speechbrain.utils.distributed import run_on_main + +logger = logging.getLogger(__name__) + + +# Brain class for speech recognition training +class ASR(sb.Brain): + """Class that manages the training loop. See speechbrain.core.Brain.""" + + def compute_forward(self, batch, stage): + """Runs all the computation of the CTC + seq2seq ASR. It returns the + posterior probabilities of the CTC and seq2seq networks. + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + At training time it returns predicted seq2seq log probabilities. + If needed it also returns the ctc output log probabilities. + At validation/test time, it returns the predicted tokens as well. + """ + # We first move the batch to the appropriate device. + batch = batch.to(self.device) + feats, self.feat_lens = self.prepare_features(stage, batch.sig) + tokens_bos, _ = self.prepare_tokens(stage, batch.tokens_bos) + + # Running the encoder (prevent propagation to feature extraction) + encoded_signal = self.modules.encoder(feats.detach()) + + # Embed tokens and pass tokens & encoded signal to decoder + embedded_tokens = self.modules.embedding(tokens_bos) + decoder_outputs, _ = self.modules.decoder( + embedded_tokens, encoded_signal, self.feat_lens + ) + + # Output layer for seq2seq log-probabilities + logits = self.modules.seq_lin(decoder_outputs) + predictions = {"seq_logprobs": self.hparams.log_softmax(logits)} + + if self.is_ctc_active(stage): + # Output layer for ctc log-probabilities + ctc_logits = self.modules.ctc_lin(encoded_signal) + predictions["ctc_logprobs"] = self.hparams.log_softmax(ctc_logits) + elif stage == sb.Stage.VALID: + predictions["tokens"], _ = self.hparams.valid_search( + encoded_signal, self.feat_lens + ) + elif stage == sb.Stage.TEST: + predictions["tokens"], _ = self.hparams.test_search( + encoded_signal, self.feat_lens + ) + + return predictions + + def is_ctc_active(self, stage): + """Check if CTC is currently active. + + Arguments + --------- + stage : sb.Stage + Currently executing stage. + """ + if stage != sb.Stage.TRAIN: + return False + current_epoch = self.hparams.epoch_counter.current + return current_epoch <= self.hparams.number_of_ctc_epochs + + def prepare_features(self, stage, wavs): + """Prepare features for computation on-the-fly + + Arguments + --------- + stage : sb.Stage + Currently executing stage. + wavs : tuple + The input signals (tensor) and their lengths (tensor). + """ + wavs, wav_lens = wavs + + # Add augmentation if specified. In this version of augmentation, we + # concatenate the original and the augment batches in a single bigger + # batch. This is more memory-demanding, but helps to improve the + # performance. Change it if you run OOM. + if stage == sb.Stage.TRAIN: + if hasattr(self.modules, "env_corrupt"): + wavs_noise = self.modules.env_corrupt(wavs, wav_lens) + wavs = torch.cat([wavs, wavs_noise], dim=0) + wav_lens = torch.cat([wav_lens, wav_lens]) + + if hasattr(self.hparams, "augmentation"): + wavs = self.hparams.augmentation(wavs, wav_lens) + + # Feature computation and normalization + feats = self.hparams.compute_features(wavs) + feats = self.modules.normalize(feats, wav_lens) + + return feats, wav_lens + + def prepare_tokens(self, stage, tokens): + """Double the tokens batch if features are doubled. + + Arguments + --------- + stage : sb.Stage + Currently executing stage. + tokens : tuple + The tokens (tensor) and their lengths (tensor). + """ + tokens, token_lens = tokens + if hasattr(self.modules, "env_corrupt") and stage == sb.Stage.TRAIN: + tokens = torch.cat([tokens, tokens], dim=0) + token_lens = torch.cat([token_lens, token_lens], dim=0) + return tokens, token_lens + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + # Compute sequence loss against targets with EOS + tokens_eos, tokens_eos_lens = self.prepare_tokens(stage, batch.tokens_eos) + loss = sb.nnet.losses.nll_loss( + log_probabilities=predictions["seq_logprobs"], + targets=tokens_eos, + length=tokens_eos_lens, + label_smoothing=self.hparams.label_smoothing, + ) + + # Add ctc loss if necessary. The total cost is a weighted sum of + # ctc loss + seq2seq loss + if self.is_ctc_active(stage): + # Load tokens without EOS as CTC targets + tokens, tokens_lens = self.prepare_tokens(stage, batch.tokens) + loss_ctc = self.hparams.ctc_cost( + predictions["ctc_logprobs"], tokens, self.feat_lens, tokens_lens + ) + loss *= 1 - self.hparams.ctc_weight + loss += self.hparams.ctc_weight * loss_ctc + + if stage != sb.Stage.TRAIN: + # Converted predicted tokens from indexes to words + predicted_words = [ + self.hparams.tokenizer.decode_ids(prediction).split(" ") + for prediction in predictions["tokens"] + ] + target_words = [words.split(" ") for words in batch.words] + + # Monitor word error rate and character error rated at + # valid and test time. + self.wer_metric.append(batch.id, predicted_words, target_words) + self.cer_metric.append(batch.id, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + # Set up statistics trackers for this stage + # In this case, we would like to keep track of the word error rate (wer) + # and the character error rate (cer) + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.error_rate_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # Summarize the statistics from the stage for record-keeping. + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + # Update learning rate + old_lr, new_lr = self.hparams.lr_annealing(stage_stats["WER"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": old_lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"]}, + min_keys=["WER"], + ) + + # We also write statistics about test data to stdout and to the logfile. + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + with open(self.hparams.wer_file, "w") as w: + self.wer_metric.write_stats(w) + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + """ + # Define audio pipeline. In this case, we simply read the path contained + # in the variable wav with the audio reader. + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + """Load the audio signal. This is done on the CPU in the `collate_fn`.""" + sig = sb.dataio.dataio.read_audio(wav) + return sig + + # Define text processing pipeline. We start from the raw text and then + # encode it using the tokenizer. The tokens with BOS are used for feeding + # decoder during training, the tokens with EOS for computing the cost function. + # The tokens without BOS or EOS is for computing CTC loss. + @sb.utils.data_pipeline.takes("words") + @sb.utils.data_pipeline.provides( + "words", "tokens_list", "tokens_bos", "tokens_eos", "tokens" + ) + def text_pipeline(words): + """Processes the transcriptions to generate proper labels""" + yield words + tokens_list = hparams["tokenizer"].encode_as_ids(words) + yield tokens_list + tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list)) + yield tokens_bos + tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]]) + yield tokens_eos + tokens = torch.LongTensor(tokens_list) + yield tokens + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_annotation"], + "valid": hparams["valid_annotation"], + "test": hparams["test_annotation"], + } + + for dataset in data_info: + datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=[audio_pipeline, text_pipeline], + output_keys=[ + "id", + "sig", + "words", + "tokens_bos", + "tokens_eos", + "tokens", + ], + ) + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. + if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + + else: + raise NotImplementedError("sorting must be random, ascending or descending") + return datasets + + +if __name__ == "__main__": + + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Data preparation, to be run on only one process. + sb.utils.distributed.run_on_main( + prepare_mini_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_annotation"], + "save_json_valid": hparams["valid_annotation"], + "save_json_test": hparams["test_annotation"], + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # In this case, pre-training is essential because mini-librispeech is not + # big enough to train an end-to-end model from scratch. With bigger dataset + # you can train from scratch and avoid this step. + # We download the pretrained LM from HuggingFace (or elsewhere depending on + # the path given in the YAML file). The tokenizer is loaded at the same time. + run_on_main(hparams["pretrainer"].collect_files) + hparams["pretrainer"].load_collected(device=torch.device("cpu")) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + asr_brain.fit( + asr_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + test_stats = asr_brain.evaluate( + test_set=datasets["test"], + min_key="WER", + test_loader_kwargs=hparams["test_dataloader_opts"], + ) diff --git a/examples/speechbrain_tutorial/train.yaml b/examples/speechbrain_tutorial/train.yaml new file mode 100644 index 000000000..ea0099026 --- /dev/null +++ b/examples/speechbrain_tutorial/train.yaml @@ -0,0 +1,334 @@ +# ############################################################################ +# Model: E2E ASR with attention-based ASR +# Encoder: CRDNN +# Decoder: GRU + beamsearch + RNNLM +# Tokens: 1000 BPE +# losses: CTC+ NLL +# Training: mini-librispeech +# Pre-Training: librispeech 960h +# Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga, Samuele Cornell 2020 +# # ############################################################################ + +# Seed needs to be set at top of yaml, before objects with parameters are instantiated +seed: 2602 +__set_seed: !apply:torch.manual_seed [!ref ] + +# If you plan to train a system on an HPC cluster with a big dataset, +# we strongly suggest doing the following: +# 1- Compress the dataset in a single tar or zip file. +# 2- Copy your dataset locally (i.e., the local disk of the computing node). +# 3- Uncompress the dataset in the local folder. +# 4- Set data_folder with the local path +# Reading data from the local disk of the compute node (e.g. $SLURM_TMPDIR with SLURM-based clusters) is very important. +# It allows you to read the data much faster without slowing down the shared filesystem. + +data_folder: ../data # In this case, data will be automatically downloaded here. +data_folder_rirs: !ref # noise/ris dataset will automatically be downloaded here +output_folder: !ref results/CRDNN_BPE_960h_LM/ +wer_file: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Language model (LM) pretraining +# NB: To avoid mismatch, the speech recognizer must be trained with the same +# tokenizer used for LM training. Here, we download everything from the +# speechbrain HuggingFace repository. However, a local path pointing to a +# directory containing the lm.ckpt and tokenizer.ckpt may also be specified +# instead. E.g if you want to use your own LM / tokenizer. +pretrained_path: speechbrain/asr-crdnn-rnnlm-librispeech + + +# Path where data manifest files will be stored. The data manifest files are created by the +# data preparation script +train_annotation: ../train.json +valid_annotation: ../valid.json +test_annotation: ../test.json + +# The train logger writes training statistics to a file, as well as stdout. +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Training parameters +number_of_epochs: 15 +number_of_ctc_epochs: 5 +batch_size: 8 +lr: 0.1 +ctc_weight: 0.5 +sorting: ascending +ckpt_interval_minutes: 15 # save checkpoint every N min +label_smoothing: 0.1 + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +# Feature parameters +sample_rate: 16000 +n_fft: 400 +n_mels: 40 + +# Model parameters +activation: !name:torch.nn.LeakyReLU +dropout: 0.15 # 0.0 - 0.3 +cnn_blocks: 2 +cnn_channels: (128, 256) +inter_layer_pooling_size: (2, 2) +cnn_kernelsize: (3, 3) +time_pooling_size: 4 +rnn_class: !name:speechbrain.nnet.RNN.LSTM +rnn_layers: 4 +rnn_neurons: 1024 +rnn_bidirectional: True +dnn_blocks: 2 +dnn_neurons: 512 +emb_size: 128 +dec_neurons: 1024 +output_neurons: 1000 # Number of tokens (same as LM) +blank_index: 0 +bos_index: 0 +eos_index: 0 + +# Decoding parameters +min_decode_ratio: 0.0 +max_decode_ratio: 1.0 +valid_beam_size: 8 +test_beam_size: 80 +eos_threshold: 1.5 +using_max_attn_shift: True +max_attn_shift: 240 +lm_weight: 0.50 +ctc_weight_decode: 0.0 +coverage_penalty: 1.5 +temperature: 1.25 +temperature_lm: 1.25 + +# The first object passed to the Brain class is this "Epoch Counter" +# which is saved by the Checkpointer so that training can be resumed +# if it gets interrupted at any point. +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +# Feature extraction +compute_features: !new:speechbrain.lobes.features.Fbank + sample_rate: !ref + n_fft: !ref + n_mels: !ref + +# Feature normalization (mean and std) +normalize: !new:speechbrain.processing.features.InputNormalization + norm_type: global + +# Added noise and reverb come from OpenRIR dataset, automatically +# downloaded and prepared with this Environmental Corruption class. +env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt + openrir_folder: !ref + babble_prob: 0.0 + reverb_prob: 0.0 + noise_prob: 1.0 + noise_snr_low: 0 + noise_snr_high: 15 + +# Adds speech change + time and frequency dropouts (time-domain implementation). +augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + sample_rate: !ref + speeds: [95, 100, 105] + +# The CRDNN model is an encoder that combines CNNs, RNNs, and DNNs. +encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN + input_shape: [null, null, !ref ] + activation: !ref + dropout: !ref + cnn_blocks: !ref + cnn_channels: !ref + cnn_kernelsize: !ref + inter_layer_pooling_size: !ref + time_pooling: True + using_2d_pooling: False + time_pooling_size: !ref + rnn_class: !ref + rnn_layers: !ref + rnn_neurons: !ref + rnn_bidirectional: !ref + rnn_re_init: True + dnn_blocks: !ref + dnn_neurons: !ref + use_rnnp: False + +# Embedding (from indexes to an embedding space of dimension emb_size). +embedding: !new:speechbrain.nnet.embedding.Embedding + num_embeddings: !ref + embedding_dim: !ref + +# Attention-based RNN decoder. +decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder + enc_dim: !ref + input_size: !ref + rnn_type: gru + attn_type: location + hidden_size: !ref + attn_dim: 1024 + num_layers: 1 + scaling: 1.0 + channels: 10 + kernel_size: 100 + re_init: True + dropout: !ref + +# Linear transformation on the top of the encoder. +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +# Linear transformation on the top of the decoder. +seq_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +# Final softmax (for log posteriors computation). +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +# Cost definition for the CTC part. +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +# Tokenizer initialization +tokenizer: !new:sentencepiece.SentencePieceProcessor + +# Objects in "modules" dict will have their parameters moved to the correct +# device, as well as having train()/eval() called on them by the Brain class +modules: + encoder: !ref + embedding: !ref + decoder: !ref + ctc_lin: !ref + seq_lin: !ref + normalize: !ref + env_corrupt: !ref + lm_model: !ref + +# Gathering all the submodels in a single model object. +model: !new:torch.nn.ModuleList + - - !ref + - !ref + - !ref + - !ref + - !ref + +# This is the RNNLM that is used according to the Huggingface repository +# NB: It has to match the pre-trained RNNLM!! +lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM + output_neurons: !ref + embedding_dim: !ref + activation: !name:torch.nn.LeakyReLU + dropout: 0.0 + rnn_layers: 2 + rnn_neurons: 2048 + dnn_blocks: 1 + dnn_neurons: 512 + return_hidden: True # For inference + +# Beamsearch is applied on the top of the decoder. If the language model is +# given, a language model is applied (with a weight specified in lm_weight). +# If ctc_weight is set, the decoder uses CTC + attention beamsearch. This +# improves the performance, but slows down decoding. For a description of +# the other parameters, please see the speechbrain.decoders.S2SRNNBeamSearchLM. + +# It makes sense to have a lighter search during validation. In this case, +# we don't use the LM and CTC probabilities during decoding. +valid_search: !new:speechbrain.decoders.S2SRNNBeamSearcher + embedding: !ref + decoder: !ref + linear: !ref + ctc_linear: !ref + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + eos_threshold: !ref + using_max_attn_shift: !ref + max_attn_shift: !ref + coverage_penalty: !ref + temperature: !ref + +# The final decoding on the test set can be more computationally demanding. +# In this case, we use the LM + CTC probabilities during decoding as well. +# Please, remove this part if you need a faster decoder. +test_search: !new:speechbrain.decoders.S2SRNNBeamSearchLM + embedding: !ref + decoder: !ref + linear: !ref + ctc_linear: !ref + language_model: !ref + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + eos_threshold: !ref + using_max_attn_shift: !ref + max_attn_shift: !ref + coverage_penalty: !ref + lm_weight: !ref + ctc_weight: !ref + temperature: !ref + temperature_lm: !ref + +# This function manages learning rate annealing over the epochs. +# We here use the NewBoB algorithm, that anneals the learning rate if +# the improvements over two consecutive epochs is less than the defined +# threshold. +lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 + +# This optimizer will be constructed by the Brain class after all parameters +# are moved to the correct device. Then it will be added to the checkpointer. +opt_class: !name:torch.optim.Adadelta + lr: !ref + rho: 0.95 + eps: 1.e-8 + +# Functions that compute the statistics to track during the validation step. +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True + +# This object is used for saving the state of training both so that it +# can be resumed if it gets interrupted, and also so that the best checkpoint +# can be later loaded for evaluation or inference. +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + normalizer: !ref + counter: !ref + +# This object is used to pretrain the language model and the tokenizers +# (defined above). In this case, we also pretrain the ASR model (to make +# sure the model converges on a small amount of data) +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: !ref + loadables: + lm: !ref + tokenizer: !ref + model: !ref + paths: + lm: !ref /lm.ckpt + tokenizer: !ref /tokenizer.ckpt + model: !ref /asr.ckpt diff --git a/tests/functional/example/orion_config_speechbrain.yaml b/tests/functional/example/orion_config_speechbrain.yaml new file mode 100644 index 000000000..d9a33102f --- /dev/null +++ b/tests/functional/example/orion_config_speechbrain.yaml @@ -0,0 +1,7 @@ +name: speechbrain-tutorial-test + +max_trials: 1 + +algorithms: + random: + seed: 1 diff --git a/tests/functional/example/test_speechbrain_tutorial.py b/tests/functional/example/test_speechbrain_tutorial.py new file mode 100644 index 000000000..45f243d1b --- /dev/null +++ b/tests/functional/example/test_speechbrain_tutorial.py @@ -0,0 +1,172 @@ +"""Tests the minimalist example script on scitkit-learn and its integration to OrĂ­on.""" +import json +import os +import subprocess + +import pytest + +import orion.core.cli +from orion.client import create_experiment +from orion.storage.base import setup_storage + + +def json_clean(path, scale): + """Modifies json file to reduce it to some scale.""" + f = open(path) + json_list = json.load(f) + new_list = {} + ctr = 0 + + for item in json_list: + if ctr < len(json_list) * scale: + new_list[item] = json_list[item] + ctr += 1 + + with open(path, "w", encoding="utf-8") as f: + json.dump(new_list, f, ensure_ascii=False, indent=2) + + +@pytest.fixture(scope="module") +def download_data(tmp_path_factory): + # Creating paths + path = tmp_path_factory.mktemp("out") + data = path / "data" + output = data / "results" + + # Calling the script, for downloading the data + + script = os.path.abspath( + str(os.path.dirname(os.path.abspath(__file__))) + + "/../../../examples/speechbrain_tutorial/download_data.py" + ) + # Using commands and overriding download paths + return_code = subprocess.call( + [ + "python", + script, + "examples/speechbrain_tutorial/train.yaml", + "--device", + "cpu", + "--data_folder", + data, + "--output_folder", + output, + "--train_annotation", + path / "train.json", + "--valid_annotation", + path / "valid.json", + "--test_annotation", + path / "test.json", + ] + ) + + # Reducing the size of the training, testing and validation set for the purpose of this test. + + json_clean(path / "test.json", 0.005) + json_clean(path / "train.json", 0.005) + json_clean(path / "valid.json", 0.005) + + assert return_code != 2, "The example script does not exists." + assert return_code != 1, "The example script did not terminates its execution." + + # Verifying if the temp dict is populated + assert len(os.listdir(data)) != 0, "The data was not downloaded correctly" + + return path + + +def test_script_integrity(capsys, download_data): + """Verifies the example script can run in standalone via `python ...`.""" + script = os.path.abspath( + str(os.path.dirname(os.path.abspath(__file__))) + + "/../../../examples/speechbrain_tutorial/main.py" + ) + path = download_data + data = path / "data" + output = data / "results" + + return_code = subprocess.call( + [ + "python", + script, + "examples/speechbrain_tutorial/train.yaml", + "--device", + "cpu", + "--number_of_epochs", + "1", + "--data_folder", + data, + "--output_folder", + output, + "--train_annotation", + path / "train.json", + "--valid_annotation", + path / "valid.json", + "--test_annotation", + path / "test.json", + ] + ) + assert return_code != 2, "The example script does not exists." + assert return_code != 1, "The example script did not terminates its execution." + assert ( + return_code == 0 and not capsys.readouterr().err + ), "The example script encountered an error during its execution." + + +@pytest.mark.usefixtures("orionstate") +def test_orion_runs_script(download_data): + """Verifies OrĂ­on can execute the example script.""" + script = os.path.abspath( + str(os.path.dirname(os.path.abspath(__file__))) + + "/../../../examples/speechbrain_tutorial/main.py" + ) + path = download_data + data = path / "data" + output = data / "results" + + config = ( + str(os.path.dirname(os.path.abspath(__file__))) + + "/orion_config_speechbrain.yaml" + ) + + orion.core.cli.main( + [ + "hunt", + "--config", + config, + "python", + script, + "examples/speechbrain_tutorial/train.yaml", + "--device", + "cpu", + "--number_of_epochs", + "1", + "--data_folder", + str(data), + "--output_folder", + str(output), + "--train_annotation", + str(path / "train.json"), + "--valid_annotation", + str(path / "valid.json"), + "--test_annotation", + str(path / "test.json"), + "--lr~loguniform(0.05, 0.2)", + ] + ) + + experiment = create_experiment(name="speechbrain-tutorial-test") + assert experiment is not None + assert experiment.version == 1 + + keys = experiment.space.keys() + assert len(keys) == 1 + assert "/lr" in keys + + storage = setup_storage() + trials = storage.fetch_trials(uid=experiment.id) + assert len(trials) > 0 + + trial = trials[0] + assert trial.status == "completed" + assert trial.params["/lr"] == 0.07452 diff --git a/tests/requirements.txt b/tests/requirements.txt index 4ebee8d7d..3d244978c 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -6,4 +6,5 @@ pytest-lazy-fixture pytest-custom_exit_code scikit-learn ptera >= 1.1.0 -selenium \ No newline at end of file +selenium +speechbrain \ No newline at end of file