diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 1d6ee8a46c..bf507bab3b 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -29,13 +29,15 @@ jobs: args: ${{ env.args }} # This job only runs for pull request comments - if: contains('\ - Nic-Ma,\ - wyli,\ - pxLi,\ - YanxuanLiu,\ - KumoLiu,\ - ', format('{0},', github.actor)) && github.event.comment.body == '/build' + if: | + github.event.comment.body == '/build' && + ( + github.actor == 'Nic-Ma' || + github.actor == 'wyli' || + github.actor == 'pxLi' || + github.actor == 'YanxuanLiu' || + github.actor == 'KumoLiu' + ) steps: - name: Check if comment is issued by authorized person run: blossom-ci diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index a387c77ebd..394685acd3 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -26,12 +26,17 @@ jobs: steps: - if: runner.os == 'windows' name: Config pagefile (Windows only) - uses: al-cheb/configure-pagefile-action@v1.3 + uses: al-cheb/configure-pagefile-action@v1.4 with: minimum-size: 8GB maximum-size: 16GB disk-root: "D:" - uses: actions/checkout@v4 + - if: runner.os == 'Linux' # Linux-only paths below; skip the step on Windows/macOS runners + name: Clean up disk space + run: | + find /opt/hostedtoolcache/* -maxdepth 0 ! 
-name 'Python' -exec rm -rf {} \; + rm -rf /usr/share/dotnet/ - uses: conda-incubator/setup-miniconda@v3 with: auto-update-conda: true @@ -56,6 +61,10 @@ jobs: conda deactivate - name: Test env (CPU ${{ runner.os }}) shell: bash -el {0} + env: + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} run: | conda activate monai $(pwd)/runtests.sh --build --unittests diff --git a/.github/workflows/cron-ngc-bundle.yml b/.github/workflows/cron-ngc-bundle.yml index bd45bc8d1e..d4b45e1d55 100644 --- a/.github/workflows/cron-ngc-bundle.yml +++ b/.github/workflows/cron-ngc-bundle.yml @@ -18,10 +18,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: cache weekly timestamp id: pip-cache run: echo "datew=$(date '+%Y-%V')" >> $GITHUB_OUTPUT diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 792fda5279..cc113b0446 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -19,18 +19,18 @@ jobs: - "PTLATEST+CUDA121" include: # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes - - environment: PT191+CUDA113 - pytorch: "torch==1.9.1 torchvision==0.10.1 --extra-index-url https://download.pytorch.org/whl/cu113" - base: "nvcr.io/nvidia/pytorch:21.06-py3" # CUDA 11.3 - environment: PT110+CUDA113 pytorch: "torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu113" base: "nvcr.io/nvidia/pytorch:21.06-py3" # CUDA 11.3 - environment: PT113+CUDA113 pytorch: "torch==1.13.1 torchvision==0.14.1 --extra-index-url https://download.pytorch.org/whl/cu113" base: "nvcr.io/nvidia/pytorch:21.06-py3" # CUDA 11.3 - - environment: PTLATEST+CUDA121 - pytorch: "-U torch torchvision --extra-index-url https://download.pytorch.org/whl/cu118" + - environment: PT113+CUDA122 + pytorch: "torch==1.13.1 
torchvision==0.14.1 --extra-index-url https://download.pytorch.org/whl/cu121" base: "nvcr.io/nvidia/pytorch:23.08-py3" # CUDA 12.2 + - environment: PTLATEST+CUDA124 + pytorch: "-U torch torchvision --extra-index-url https://download.pytorch.org/whl/cu121" + base: "nvcr.io/nvidia/pytorch:24.03-py3" # CUDA 12.4 container: image: ${{ matrix.base }} options: "--gpus all" @@ -50,6 +50,10 @@ jobs: python -m pip install -r requirements-dev.txt python -m pip list - name: Run tests report coverage + env: + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} run: | export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] echo "Sleep $LAUNCH_DELAY" @@ -76,7 +80,7 @@ jobs: if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:22.10", "pytorch:23.08"] + container: ["pytorch:23.08", "pytorch:24.03"] container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -94,6 +98,10 @@ jobs: python -m pip install -r requirements-dev.txt python -m pip list - name: Run tests report coverage + env: + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} run: | export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] echo "Sleep $LAUNCH_DELAY" @@ -121,7 +129,7 @@ jobs: if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:23.08"] + container: ["pytorch:24.03"] container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -196,6 +204,10 @@ jobs: - name: Run tests report coverage # The docker image process has done the compilation. # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. 
+ env: + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} run: | cd /opt/monai nvidia-smi @@ -221,7 +233,7 @@ jobs: if: github.repository == 'Project-MONAI/MONAI' needs: cron-gpu # so that monai itself is verified first container: - image: nvcr.io/nvidia/pytorch:23.08-py3 # testing with the latest pytorch base image + image: nvcr.io/nvidia/pytorch:24.03-py3 # testing with the latest pytorch base image options: "--gpus all --ipc=host" runs-on: [self-hosted, linux, x64, integration] steps: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 65716f86f9..17ffe4cf90 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -100,3 +100,6 @@ jobs: shell: bash env: QUICKTEST: True + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index c82530a551..5be2ebb86c 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -68,6 +68,9 @@ jobs: shell: bash env: BUILD_MONAI: 1 + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} run: ./runtests.sh --build --net - name: Add reaction diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index a6d7981814..ead622b39c 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -29,10 +29,6 @@ jobs: - "PT210+CUDA121DOCKER" include: # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes - - environment: PT19+CUDA114DOCKER - # 21.10: 1.10.0a0+0aef44c - pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error - base: "nvcr.io/nvidia/pytorch:21.10-py3" - environment: PT110+CUDA111 pytorch: "torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu111" base: 
"nvcr.io/nvidia/cuda:11.1.1-devel-ubuntu18.04" @@ -47,6 +43,10 @@ jobs: # 23.08: 2.1.0a0+29c30b1 pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error base: "nvcr.io/nvidia/pytorch:23.08-py3" + - environment: PT210+CUDA121DOCKER + # 24.03: 2.3.0a0+40ec155e58.nv24.3 + pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error + base: "nvcr.io/nvidia/pytorch:24.03-py3" container: image: ${{ matrix.base }} options: --gpus all --env NVIDIA_DISABLE_REQUIRE=true # workaround for unsatisfied condition: cuda>=11.6 @@ -62,7 +62,7 @@ jobs: if [ ${{ matrix.environment }} = "PT110+CUDA111" ] || \ [ ${{ matrix.environment }} = "PT113+CUDA116" ] then - PYVER=3.8 PYSFX=3 DISTUTILS=python3-distutils && \ + PYVER=3.9 PYSFX=3 DISTUTILS=python3-distutils && \ apt-get update && apt-get install -y --no-install-recommends \ curl \ pkg-config \ diff --git a/.github/workflows/pythonapp-min.yml b/.github/workflows/pythonapp-min.yml index bbe7579774..b0d37937e9 100644 --- a/.github/workflows/pythonapp-min.yml +++ b/.github/workflows/pythonapp-min.yml @@ -9,6 +9,8 @@ on: - main - releasing/* pull_request: + head_ref-ignore: + - dev concurrency: # automatically cancel the previously triggered workflows when there's a newer version @@ -29,10 +31,10 @@ jobs: timeout-minutes: 40 steps: - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Prepare pip wheel run: | which python @@ -65,13 +67,16 @@ jobs: shell: bash env: QUICKTEST: True + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} min-dep-py3: # min dependencies installed tests for different python runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.9', '3.10', '3.11', '3.12'] timeout-minutes: 40 steps: - uses: actions/checkout@v4 @@ -110,20 +115,23 
@@ jobs: ./runtests.sh --min env: QUICKTEST: True + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} min-dep-pytorch: # min dependencies installed tests for different pytorch runs-on: ubuntu-latest strategy: fail-fast: false matrix: - pytorch-version: ['1.9.1', '1.10.2', '1.11.0', '1.12.1', '1.13', 'latest'] + pytorch-version: ['1.10.2', '1.11.0', '1.12.1', '1.13', '2.0.1', 'latest'] timeout-minutes: 40 steps: - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Prepare pip wheel run: | which python @@ -159,3 +167,6 @@ jobs: ./runtests.sh --min env: QUICKTEST: True + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index b011e65cf1..3c39166c1e 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -9,6 +9,8 @@ on: - main - releasing/* pull_request: + head_ref-ignore: + - dev concurrency: # automatically cancel the previously triggered workflows when there's a newer version @@ -27,10 +29,10 @@ jobs: opt: ["codeformat", "pytype", "mypy"] steps: - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: cache weekly timestamp id: pip-cache run: | @@ -43,6 +45,7 @@ jobs: key: ${{ runner.os }}-pip-${{ steps.pip-cache.outputs.datew }} - name: Install dependencies run: | + find /opt/hostedtoolcache/* -maxdepth 0 ! 
-name 'Python' -exec rm -rf {} \; python -m pip install --upgrade pip wheel python -m pip install -r requirements-dev.txt - name: Lint and type check @@ -62,16 +65,16 @@ jobs: steps: - if: runner.os == 'windows' name: Config pagefile (Windows only) - uses: al-cheb/configure-pagefile-action@v1.3 + uses: al-cheb/configure-pagefile-action@v1.4 with: minimum-size: 8GB maximum-size: 16GB disk-root: "D:" - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Prepare pip wheel run: | which python @@ -96,8 +99,10 @@ jobs: name: Install itk pre-release (Linux only) run: | python -m pip install --pre -U itk + find /opt/hostedtoolcache/* -maxdepth 0 ! -name 'Python' -exec rm -rf {} \; - name: Install the dependencies run: | + python -m pip install --user --upgrade pip wheel python -m pip install torch==1.13.1 torchvision==0.14.1 cat "requirements-dev.txt" python -m pip install -r requirements-dev.txt @@ -127,10 +132,10 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: cache weekly timestamp id: pip-cache run: | @@ -145,7 +150,8 @@ jobs: key: ${{ runner.os }}-pip-${{ steps.pip-cache.outputs.datew }} - name: Install dependencies run: | - python -m pip install --user --upgrade pip setuptools wheel twine + find /opt/hostedtoolcache/* -maxdepth 0 ! 
-name 'Python' -exec rm -rf {} \; + python -m pip install --user --upgrade pip setuptools wheel twine packaging # install the latest pytorch for testing # however, "pip install monai*.tar.gz" will build cpp/cuda with an isolated # fresh torch installation according to pyproject.toml @@ -208,10 +214,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: cache weekly timestamp id: pip-cache run: | diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c134724665..a014a4ed1d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install setuptools run: | - python -m pip install --user --upgrade setuptools wheel + python -m pip install --user --upgrade setuptools wheel packaging - name: Build and test source archive and wheel file run: | find /opt/hostedtoolcache/* -maxdepth 0 ! -name 'Python' -exec rm -rf {} \; @@ -104,7 +104,7 @@ jobs: run: | find /opt/hostedtoolcache/* -maxdepth 0 ! 
-name 'Python' -exec rm -rf {} \; git describe - python -m pip install --user --upgrade setuptools wheel + python -m pip install --user --upgrade setuptools wheel packaging python setup.py build cat build/lib/monai/_version.py - name: Upload version @@ -119,7 +119,8 @@ jobs: rm -rf {*,.[^.]*} release_tag_docker: - if: github.repository == 'Project-MONAI/MONAI' + # if: github.repository == 'Project-MONAI/MONAI' + if: ${{ false }} needs: versioning runs-on: ubuntu-latest steps: diff --git a/.github/workflows/setupapp.yml b/.github/workflows/setupapp.yml index c6ad243b81..7e01f55cd9 100644 --- a/.github/workflows/setupapp.yml +++ b/.github/workflows/setupapp.yml @@ -49,6 +49,10 @@ jobs: python -m pip install --upgrade torch torchvision python -m pip install -r requirements-dev.txt - name: Run unit tests report coverage + env: + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} run: | python -m pip list git config --global --add safe.directory /__w/MONAI/MONAI @@ -77,7 +81,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v4 with: @@ -104,6 +108,10 @@ jobs: python -m pip install --upgrade pip wheel python -m pip install -r requirements-dev.txt - name: Run quick tests CPU ubuntu + env: + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + NGC_ORG: ${{ secrets.NGC_ORG }} + NGC_TEAM: ${{ secrets.NGC_TEAM }} run: | python -m pip list python -c 'import torch; print(torch.__version__); print(torch.rand(5,3))' @@ -119,10 +127,10 @@ jobs: install: # pip install from github url, the default branch is dev runs-on: ubuntu-latest steps: - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: cache weekly timestamp id: pip-cache run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 14b41bbeb8..3fff6ed631 
100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: end-of-file-fixer - id: trailing-whitespace @@ -27,7 +27,7 @@ repos: - id: end-of-file-fixer - id: mixed-line-ending - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.261 + rev: v0.3.5 hooks: - id: ruff args: @@ -58,7 +58,7 @@ repos: name: Unused noqa additional_dependencies: - flake8>=3.8.1 - - flake8-bugbear + - flake8-bugbear<=24.2.6 - flake8-comprehensions - pep8-naming exclude: | @@ -69,7 +69,7 @@ repos: )$ - repo: https://github.com/hadialqattan/pycln - rev: v2.1.3 + rev: v2.4.0 hooks: - id: pycln args: [--config=pyproject.toml] diff --git a/CHANGELOG.md b/CHANGELOG.md index 61be8f07c1..804508c262 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,103 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [Unreleased] +## [1.3.2] - 2024-06-25 +### Fixed +#### misc. 
+* Updated Numpy version constraint to < 2.0 (#7859) + +## [1.3.1] - 2024-05-17 +### Added +* Support for `by_measure` argument in `RemoveSmallObjects` (#7137) +* Support for `pretrained` flag in `ResNet` (#7095) +* Support for uploading and downloading bundles to and from the Hugging Face Hub (#6454) +* Added weight parameter in DiceLoss to apply weight to voxels of each class (#7158) +* Support for returning dice for each class in `DiceMetric` (#7163) +* Introduced `ComponentStore` for storage purposes (#7159) +* Added utilities used in MONAI Generative (#7134) +* Enabled Python 3.11 support for `convert_to_torchscript` and `convert_to_onnx` (#7182) +* Support for MLflow in `AutoRunner` (#7176) +* `fname_regex` option in PydicomReader (#7181) +* Allowed setting AutoRunner parameters from config (#7175) +* `VoxelMorphUNet` and `VoxelMorph` (#7178) +* Enabled `cache` option in `GridPatchDataset` (#7180) +* Introduced `class_labels` option in `write_metrics_reports` for improved readability (#7249) +* `DiffusionLoss` for image registration task (#7272) +* Supported specifying `filename` in `SaveImage` (#7318) +* Compile support in `SupervisedTrainer` and `SupervisedEvaluator` (#7375) +* `mlflow_experiment_name` support in `Auto3DSeg` (#7442) +* Arm support (#7500) +* `BarlowTwinsLoss` for representation learning (#7530) +* `SURELoss` and `ConjugateGradient` for diffusion models (#7308) +* Support for `CutMix`, `CutOut`, and `MixUp` augmentation techniques (#7198) +* `meta_file` and `logging_file` options to `BundleWorkflow` (#7549) +* `properties_path` option to `BundleWorkflow` for customized properties (#7542) +* Support for both soft and hard clipping in `ClipIntensityPercentiles` (#7535) +* Support for not saving artifacts in `MLFlowHandler` (#7604) +* Support for multi-channel images in `PerceptualLoss` (#7568) +* Added ResNet backbone for `FlexibleUNet` (#7571) +* Introduced `dim_head` option in `SABlock` to set dimensions for each head (#7664) +* Direct links 
to GitHub source code from the docs (#7738, #7779) +#### misc. +* Refactored `list_data_collate` and `collate_meta_tensor` to utilize the latest PyTorch API (#7165) +* Added `__str__` method in `Metric` base class (#7487) +* Made enhancements for testing files (#7662, #7670, #7663, #7671, #7672) +* Improved documentation for bundles (#7116) +### Fixed +#### transforms +* Addressed issue where lazy mode was ignored in `SpatialPadd` (#7316) +* Tracked applied operations in `ImageFilter` (#7395) +* Warnings are now given only if missing class is not set to 0 in `generate_label_classes_crop_centers` (#7602) +* Input is now always converted to C-order in `distance_transform_edt` to ensure consistent behavior (#7675) +#### data +* Modified .npz file behavior to use keys in `NumpyReader` (#7148) +* Handled corrupted cached files in `PersistentDataset` (#7244) +* Corrected affine update in `NrrdReader` (#7415) +#### metrics and losses +* Addressed precision issue in `get_confusion_matrix` (#7187) +* Harmonized and clarified documentation and tests for dice loss variants (#7587) +#### networks +* Removed hard-coded `spatial_dims` in `SwinTransformer` (#7302) +* Fixed learnable `position_embeddings` in `PatchEmbeddingBlock` (#7564, #7605) +* Removed `memory_pool_limit` in TRT config (#7647) +* Propagated `kernel_size` to `ConvBlocks` within `AttentionUnet` (#7734) +* Addressed hard-coded activation layer in `ResNet` (#7749) +#### bundle +* Resolved bundle download issue (#7280) +* Updated `bundle_root` directory for `NNIGen` (#7586) +* Checked for `num_fold` and failed early if incorrect (#7634) +* Enhanced logging logic in `ConfigWorkflow` (#7745) +#### misc. 
+* Enabled chaining in `Auto3DSeg` CLI (#7168) +* Addressed useless error message in `nnUNetV2Runner` (#7217) +* Resolved typing and deprecation issues in Mypy (#7231) +* Quoted `$PY_EXE` variable to handle Python path that contains spaces in Bash (#7268) +* Improved documentation, code examples, and warning messages in various modules (#7234, #7213, #7271, #7326, #7569, #7584) +* Fixed typos in various modules (#7321, #7322, #7458, #7595, #7612) +* Enhanced docstrings in various modules (#7245, #7381, #7746) +* Handled error when data is on CPU in `DataAnalyzer` (#7310) +* Updated version requirements for third-party packages (#7343, #7344, #7384, #7448, #7659, #7704, #7744, #7742, #7780) +* Addressed incorrect slice computation in `ImageStats` (#7374) +* Avoided editing a loop's mutable iterable to address B308 (#7397) +* Fixed issue with `CUDA_VISIBLE_DEVICES` setting being ignored (#7408, #7581) +* Avoided changing Python version in CICD (#7424) +* Renamed partial to callable in instantiate mode (#7413) +* Imported AttributeError for Python 3.12 compatibility (#7482) +* Updated `nnUNetV2Runner` to support nnunetv2 2.2 (#7483) +* Used uint8 instead of int8 in `LabelStats` (#7489) +* Utilized subprocess for nnUNet training (#7576) +* Addressed deprecation warning in ruff (#7625) +* Fixed downloading failure on FIPS machine (#7698) +* Updated `torch_tensorrt` compile parameters to avoid warning (#7714) +* Restricted `Auto3DSeg` fold input based on datalist (#7778) +### Changed +* Base Docker image upgraded to `nvcr.io/nvidia/pytorch:24.03-py3` from `nvcr.io/nvidia/pytorch:23.08-py3` +### Removed +* Removed unrecommended star-arg unpacking after a keyword argument, addressed B026 (#7262) +* Skipped old PyTorch version test for `SwinUNETR` (#7266) +* Dropped docker build workflow and migrated to Nvidia Blossom system (#7450) +* Dropped Python 3.8 test on quick-py3 workflow (#7719) + ## [1.3.0] - 2023-10-12 ### Added * Intensity transforms `ScaleIntensityFixedMean` and 
`RandScaleIntensityFixedMean` (#6542) @@ -943,7 +1040,9 @@ the postprocessing steps should be used before calling the metrics methods [highlights]: https://github.com/Project-MONAI/MONAI/blob/master/docs/source/highlights.md -[Unreleased]: https://github.com/Project-MONAI/MONAI/compare/1.3.0...HEAD +[Unreleased]: https://github.com/Project-MONAI/MONAI/compare/1.3.2...HEAD +[1.3.2]: https://github.com/Project-MONAI/MONAI/compare/1.3.1...1.3.2 +[1.3.1]: https://github.com/Project-MONAI/MONAI/compare/1.3.0...1.3.1 [1.3.0]: https://github.com/Project-MONAI/MONAI/compare/1.2.0...1.3.0 [1.2.0]: https://github.com/Project-MONAI/MONAI/compare/1.1.0...1.2.0 [1.1.0]: https://github.com/Project-MONAI/MONAI/compare/1.0.1...1.1.0 diff --git a/CITATION.cff b/CITATION.cff index cac47faae4..b535a77a9f 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -6,8 +6,8 @@ title: "MONAI: Medical Open Network for AI" abstract: "AI Toolkit for Healthcare Imaging" authors: - name: "MONAI Consortium" -date-released: 2023-10-12 -version: "1.3.0" +date-released: 2024-06-26 +version: "1.3.2" identifiers: - description: "This DOI represents all versions of MONAI, and will always resolve to the latest one." type: doi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5c886cff30..8db796637f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -123,7 +123,7 @@ or (for new features that would not break existing functionality): ``` It is recommended that the new test `test_[module_name].py` is constructed by using only -python 3.8+ build-in functions, `torch`, `numpy`, `coverage` (for reporting code coverages) and `parameterized` (for organising test cases) packages. +Python 3.9+ built-in functions, `torch`, `numpy`, `coverage` (for reporting code coverage) and `parameterized` (for organising test cases) packages. 
If it requires any other external packages, please make sure: - the packages are listed in [`requirements-dev.txt`](requirements-dev.txt) - the new test `test_[module_name].py` is added to the `exclude_cases` in [`./tests/min_tests.py`](./tests/min_tests.py) so that diff --git a/Dockerfile b/Dockerfile index cb1300ea90..8e255597d1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,11 +11,19 @@ # To build with a different base image # please run `docker build` using the `--build-arg PYTORCH_IMAGE=...` flag. -ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.08-py3 +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3 FROM ${PYTORCH_IMAGE} LABEL maintainer="monai.contact@gmail.com" +# TODO: remark for issue [revise the dockerfile](https://github.com/zarr-developers/numcodecs/issues/431) +RUN if [[ $(uname -m) =~ "aarch64" ]]; then \ + export CFLAGS="-O3" && \ + export DISABLE_NUMCODECS_SSE2=true && \ + export DISABLE_NUMCODECS_AVX2=true && \ + pip install numcodecs; \ + fi + WORKDIR /opt/monai # install full deps diff --git a/README.md b/README.md index 7565fea1b7..5345cdb926 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,6 @@ [![premerge](https://github.com/Project-MONAI/MONAI/actions/workflows/pythonapp.yml/badge.svg?branch=dev)](https://github.com/Project-MONAI/MONAI/actions/workflows/pythonapp.yml) [![postmerge](https://img.shields.io/github/checks-status/project-monai/monai/dev?label=postmerge)](https://github.com/Project-MONAI/MONAI/actions?query=branch%3Adev) -[![docker](https://github.com/Project-MONAI/MONAI/actions/workflows/docker.yml/badge.svg?branch=dev)](https://github.com/Project-MONAI/MONAI/actions/workflows/docker.yml) [![Documentation Status](https://readthedocs.org/projects/monai/badge/?version=latest)](https://docs.monai.io/en/latest/) [![codecov](https://codecov.io/gh/Project-MONAI/MONAI/branch/dev/graph/badge.svg?token=6FTC7U1JJ4)](https://codecov.io/gh/Project-MONAI/MONAI) diff --git a/docs/images/python.svg b/docs/images/python.svg index 
b7aa7c59bd..8ef6b61c03 100644 --- a/docs/images/python.svg +++ b/docs/images/python.svg @@ -1 +1 @@ -pythonpython3.8+3.8+ +pythonpython3.9+3.9+ diff --git a/docs/requirements.txt b/docs/requirements.txt index e5bedf8552..ff94f7b6de 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,7 +6,7 @@ itk>=5.2 nibabel parameterized scikit-image>=0.19.0 -scipy>=1.7.1 +scipy>=1.12.0; python_version >= '3.9' tensorboard commonmark==0.9.1 recommonmark==0.6.0 @@ -21,8 +21,8 @@ sphinxcontrib-serializinghtml sphinx-autodoc-typehints==1.11.1 pandas einops -transformers<4.22; python_version <= '3.10' # https://github.com/Project-MONAI/MONAI/issues/5157 -mlflow>=1.28.0 +transformers>=4.36.0, <4.41.0; python_version <= '3.10' +mlflow>=2.12.2 clearml>=1.10.0rc0 tensorboardX imagecodecs; platform_system == "Linux" or platform_system == "Darwin" @@ -40,3 +40,5 @@ onnx>=1.13.0 onnxruntime; python_version <= '3.10' zarr huggingface_hub +pyamg>=5.0.0 +packaging diff --git a/docs/source/conf.py b/docs/source/conf.py index fdb10fbe03..a91f38081f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,6 +13,8 @@ import os import subprocess import sys +import importlib +import inspect sys.path.insert(0, os.path.abspath("..")) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) @@ -95,7 +97,7 @@ def generate_apidocs(*args): "sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx.ext.autodoc", - "sphinx.ext.viewcode", + "sphinx.ext.linkcode", "sphinx.ext.autosectionlabel", "sphinx.ext.autosummary", "sphinx_autodoc_typehints", @@ -137,8 +139,8 @@ def generate_apidocs(*args): "github_user": "Project-MONAI", "github_repo": "MONAI", "github_version": "dev", - "doc_path": "docs/", - "conf_py_path": "/docs/", + "doc_path": "docs/source", + "conf_py_path": "/docs/source", "VERSION": version, } html_scaled_image_link = False @@ -162,3 +164,60 @@ def setup(app): # Hook to allow for automatic generation of API docs # before doc deployment begins. 
app.connect("builder-inited", generate_apidocs) + + +# -- Linkcode configuration -------------------------------------------------- +DEFAULT_REF = "dev" +read_the_docs_ref = os.environ.get("READTHEDOCS_GIT_IDENTIFIER", None) +if read_the_docs_ref: + # When building on ReadTheDocs, link to the specific commit + # https://docs.readthedocs.io/en/stable/reference/environment-variables.html#envvar-READTHEDOCS_GIT_IDENTIFIER + git_ref = read_the_docs_ref +elif os.environ.get("GITHUB_REF_TYPE", "branch") == "tag": + # When building a tag, link to the tag itself + git_ref = os.environ.get("GITHUB_REF", DEFAULT_REF) +else: + git_ref = os.environ.get("GITHUB_SHA", DEFAULT_REF) + +DEFAULT_REPOSITORY = "Project-MONAI/MONAI" +repository = os.environ.get("GITHUB_REPOSITORY", DEFAULT_REPOSITORY) + +base_code_url = f"https://github.com/{repository}/blob/{git_ref}" +MODULE_ROOT_FOLDER = "monai" +repo_root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + + +# Adjusted from https://github.com/python-websockets/websockets/blob/main/docs/conf.py +def linkcode_resolve(domain, info): + """Return the GitHub source URL for a documented Python object, or None when no link can be built.""" + if domain != "py": + raise ValueError( + f"expected domain to be 'py', got {domain}. " + "Please adjust linkcode_resolve to either handle this domain or ignore it." + ) + + mod = importlib.import_module(info["module"]) + if "." in info["fullname"]: + objname, attrname = info["fullname"].split(".", 1) + obj = getattr(mod, objname) + try: + # object is a method of a class + obj = getattr(obj, attrname) + except AttributeError: + # object is an attribute of a class, or is nested more than one level deep + return None + else: + obj = getattr(mod, info["fullname"]) + + try: + file = inspect.getsourcefile(obj) + source, lineno = inspect.getsourcelines(obj) + except (TypeError, OSError): + # e.g. 
object is a typing.Union (TypeError) or its source cannot be retrieved (OSError) + return None + file = os.path.relpath(file, repo_root_path) + if not file.startswith(MODULE_ROOT_FOLDER): + # e.g. 
object is a typing.NewType + return None + start, end = lineno, lineno + len(source) - 1 + return f"{base_code_url}/{file}#L{start}-L{end}" diff --git a/docs/source/engines.rst b/docs/source/engines.rst index afb2682822..a015c7b2a3 100644 --- a/docs/source/engines.rst +++ b/docs/source/engines.rst @@ -30,6 +30,11 @@ Workflows .. autoclass:: GanTrainer :members: +`AdversarialTrainer` +~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: AdversarialTrainer + :members: + `Evaluator` ~~~~~~~~~~~ .. autoclass:: Evaluator diff --git a/docs/source/index.rst b/docs/source/index.rst index b6c8c22f98..85adee7e44 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -37,7 +37,7 @@ Features Getting started --------------- -`MedNIST demo `_ and `MONAI for PyTorch Users `_ are available on Colab. +`MedNIST demo `_ and `MONAI for PyTorch Users `_ are available on Colab. Examples and notebook tutorials are located at `Project-MONAI/tutorials `_. diff --git a/docs/source/inferers.rst b/docs/source/inferers.rst index 33f9e14d83..326f56e96c 100644 --- a/docs/source/inferers.rst +++ b/docs/source/inferers.rst @@ -49,6 +49,29 @@ Inferers :members: :special-members: __call__ +`DiffusionInferer` +~~~~~~~~~~~~~~~~~~ +.. autoclass:: DiffusionInferer + :members: + :special-members: __call__ + +`LatentDiffusionInferer` +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: LatentDiffusionInferer + :members: + :special-members: __call__ + +`ControlNetDiffusionInferer` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: ControlNetDiffusionInferer + :members: + :special-members: __call__ + +`ControlNetLatentDiffusionInferer` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autoclass:: ControlNetLatentDiffusionInferer + :members: + :special-members: __call__ Splitters --------- diff --git a/docs/source/installation.md b/docs/source/installation.md index d77253f0f9..70a8b6f1d4 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -19,7 +19,7 @@ --- -MONAI's core functionality is written in Python 3 (>= 3.8) and only requires [Numpy](https://numpy.org/) and [Pytorch](https://pytorch.org/). +MONAI's core functionality is written in Python 3 (>= 3.9) and only requires [Numpy](https://numpy.org/) and [Pytorch](https://pytorch.org/). The package is currently distributed via Github as the primary source code repository, and the Python package index (PyPI). The pre-built Docker images are made available on DockerHub. @@ -254,10 +254,10 @@ Since MONAI v0.2.0, the extras syntax such as `pip install 'monai[nibabel]'` is - The options are ``` -[nibabel, skimage, scipy, pillow, tensorboard, gdown, ignite, torchvision, itk, tqdm, lmdb, psutil, cucim, openslide, pandas, einops, transformers, mlflow, clearml, matplotlib, tensorboardX, tifffile, imagecodecs, pyyaml, fire, jsonschema, ninja, pynrrd, pydicom, h5py, nni, optuna, onnx, onnxruntime, zarr, lpips, pynvml, huggingface_hub] +[nibabel, skimage, scipy, pillow, tensorboard, gdown, ignite, torchvision, itk, tqdm, lmdb, psutil, cucim, openslide, pandas, einops, transformers, mlflow, clearml, matplotlib, tensorboardX, tifffile, imagecodecs, pyyaml, fire, jsonschema, ninja, pynrrd, pydicom, h5py, nni, optuna, onnx, onnxruntime, zarr, lpips, pynvml, huggingface_hub, segment-anything] ``` which correspond to `nibabel`, `scikit-image`,`scipy`, `pillow`, `tensorboard`, -`gdown`, `pytorch-ignite`, `torchvision`, `itk`, `tqdm`, `lmdb`, `psutil`, `cucim`, `openslide-python`, `pandas`, `einops`, `transformers`, `mlflow`, `clearml`, `matplotlib`, `tensorboardX`, `tifffile`, `imagecodecs`, `pyyaml`, `fire`, `jsonschema`, `ninja`, `pynrrd`, `pydicom`, `h5py`, `nni`, `optuna`, `onnx`, 
`onnxruntime`, `zarr`, `lpips`, `nvidia-ml-py`, and `huggingface_hub` respectively. +`gdown`, `pytorch-ignite`, `torchvision`, `itk`, `tqdm`, `lmdb`, `psutil`, `cucim`, `openslide-python`, `pandas`, `einops`, `transformers`, `mlflow`, `clearml`, `matplotlib`, `tensorboardX`, `tifffile`, `imagecodecs`, `pyyaml`, `fire`, `jsonschema`, `ninja`, `pynrrd`, `pydicom`, `h5py`, `nni`, `optuna`, `onnx`, `onnxruntime`, `zarr`, `lpips`, `nvidia-ml-py`, `huggingface_hub`, `pyamg` and `segment-anything` respectively. - `pip install 'monai[all]'` installs all the optional dependencies. diff --git a/docs/source/losses.rst b/docs/source/losses.rst index e929e9d605..528ccd1173 100644 --- a/docs/source/losses.rst +++ b/docs/source/losses.rst @@ -73,6 +73,11 @@ Segmentation Losses .. autoclass:: ContrastiveLoss :members: +`BarlowTwinsLoss` +~~~~~~~~~~~~~~~~~ +.. autoclass:: BarlowTwinsLoss + :members: + `HausdorffDTLoss` ~~~~~~~~~~~~~~~~~ .. autoclass:: HausdorffDTLoss @@ -88,6 +93,11 @@ Segmentation Losses .. autoclass:: SoftDiceclDiceLoss :members: +`NACLLoss` +~~~~~~~~~~ +.. autoclass:: NACLLoss + :members: + Registration Losses ------------------- @@ -134,6 +144,11 @@ Reconstruction Losses .. autoclass:: JukeboxLoss :members: +`SURELoss` +~~~~~~~~~~ +.. autoclass:: SURELoss + :members: + Loss Wrappers ------------- diff --git a/docs/source/mb_specification.rst b/docs/source/mb_specification.rst index cedafa0d23..56d660e35c 100644 --- a/docs/source/mb_specification.rst +++ b/docs/source/mb_specification.rst @@ -63,12 +63,12 @@ This file contains the metadata information relating to the model, including wha * **monai_version**: version of MONAI the bundle was generated on, later versions expected to work. * **pytorch_version**: version of Pytorch the bundle was generated on, later versions expected to work. * **numpy_version**: version of Numpy the bundle was generated on, later versions expected to work. 
-* **optional_packages_version**: dictionary relating optional package names to their versions, these packages are not needed but are recommended to be installed with this stated minimum version. +* **required_packages_version**: dictionary relating required package names to their versions. These are packages in addition to the base requirements of MONAI which this bundle absolutely needs. For example, if the bundle must load Nifti files the Nibabel package will be required. * **task**: plain-language description of what the model is meant to do. * **description**: longer form plain-language description of what the model is, what it does, etc. * **authors**: state author(s) of the model. * **copyright**: state model copyright. -* **network_data_format**: defines the format, shape, and meaning of inputs and outputs to the model, contains keys "inputs" and "outputs" relating named inputs/outputs to their format specifiers (defined below). +* **network_data_format**: defines the format, shape, and meaning of inputs and outputs to the (primary) model, contains keys "inputs" and "outputs" relating named inputs/outputs to their format specifiers (defined below). There is also an optional "post_processed_outputs" key stating the format of "outputs" after postprocessing transforms are applied, this is used to describe the final output from the bundle if it varies from the raw network output. These keys can also relate to primitive values (number, string, boolean), instead of the tensor format specified below. Tensor format specifiers are used to define input and output tensors and their meanings, and must be a dictionary containing at least these keys: @@ -89,6 +89,8 @@ Optional keys: * **data_source**: description of where training/validation can be sourced. * **data_type**: type of source data used for training/validation. * **references**: list of published referenced relating to the model. +* **supported_apps**: list of supported applications which use bundles, eg. 
'monai-label' would be present if the bundle is compatible with MONAI Label applications. +* **\*_data_format**: defines the format, shape, and meaning of inputs and outputs to additional models which are secondary to the main model. This contains the same sort of information as **network_data_format** which describes networks providing secondary functionality, eg. a localisation network used to identify ROI in an image for cropping before data is sent to the primary network of this bundle. The format for tensors used as inputs and outputs can be used to specify semantic meaning of these values, and later is used by software handling bundles to determine how to process and interpret this data. There are various types of image data that MONAI is uses, and other data types such as point clouds, dictionary sequences, time signals, and others. The following list is provided as a set of supported definitions of what a tensor "format" is but is not exhaustive and users can provide their own which would be left up to the model users to interpret: @@ -124,7 +126,7 @@ An example JSON metadata file: "monai_version": "0.9.0", "pytorch_version": "1.10.0", "numpy_version": "1.21.2", - "optional_packages_version": {"nibabel": "3.2.1"}, + "required_packages_version": {"nibabel": "3.2.1"}, "task": "Decathlon spleen segmentation", "description": "A pre-trained model for volumetric (3D) segmentation of the spleen from CT image", "authors": "MONAI team", diff --git a/docs/source/networks.rst b/docs/source/networks.rst index 8eada7933f..1810fec49b 100644 --- a/docs/source/networks.rst +++ b/docs/source/networks.rst @@ -408,6 +408,11 @@ Layers .. autoclass:: LLTM :members: +`ConjugateGradient` +~~~~~~~~~~~~~~~~~~~ +.. autoclass:: ConjugateGradient + :members: + `Utilities` ~~~~~~~~~~~ .. automodule:: monai.networks.layers.convutils @@ -476,6 +481,11 @@ Nets .. autoclass:: SegResNetDS :members: +`SegResNetDS2` +~~~~~~~~~~~~~~ +.. 
autoclass:: SegResNetDS2 + :members: + `SegResNetVAE` ~~~~~~~~~~~~~~ .. autoclass:: SegResNetVAE @@ -486,6 +496,11 @@ Nets .. autoclass:: ResNet :members: +`ResNetFeatures` +~~~~~~~~~~~~~~~~ +.. autoclass:: ResNetFeatures + :members: + `SENet` ~~~~~~~ .. autoclass:: SENet @@ -546,6 +561,11 @@ Nets .. autoclass:: UNETR :members: +`VISTA3D` +~~~~~~~~~ +.. autoclass:: VISTA3D + :members: + `SwinUNETR` ~~~~~~~~~~~ .. autoclass:: SwinUNETR diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 8990e7991d..637f0873f1 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -309,6 +309,12 @@ Intensity :members: :special-members: __call__ +`ClipIntensityPercentiles` +"""""""""""""""""""""""""" +.. autoclass:: ClipIntensityPercentiles + :members: + :special-members: __call__ + `RandScaleIntensity` """""""""""""""""""" .. image:: https://raw.githubusercontent.com/Project-MONAI/DocImages/main/transforms/RandScaleIntensity.png @@ -661,6 +667,33 @@ Post-processing :members: :special-members: __call__ +`Invert` +""""""""" +.. autoclass:: Invert + :members: + :special-members: __call__ + +Regularization +^^^^^^^^^^^^^^ + +`CutMix` +"""""""" +.. autoclass:: CutMix + :members: + :special-members: __call__ + +`CutOut` +"""""""" +.. autoclass:: CutOut + :members: + :special-members: __call__ + +`MixUp` +""""""" +.. autoclass:: MixUp + :members: + :special-members: __call__ + Signal ^^^^^^^ @@ -1384,6 +1417,12 @@ Intensity (Dict) :members: :special-members: __call__ +`ClipIntensityPercentilesd` +""""""""""""""""""""""""""" +.. autoclass:: ClipIntensityPercentilesd + :members: + :special-members: __call__ + `RandScaleIntensityd` """"""""""""""""""""" .. image:: https://raw.githubusercontent.com/Project-MONAI/DocImages/main/transforms/RandScaleIntensityd.png @@ -1707,6 +1746,27 @@ Post-processing (Dict) :members: :special-members: __call__ +Regularization (Dict) +^^^^^^^^^^^^^^^^^^^^^ + +`CutMixd` +""""""""" +.. 
autoclass:: CutMixd + :members: + :special-members: __call__ + +`CutOutd` +""""""""" +.. autoclass:: CutOutd + :members: + :special-members: __call__ + +`MixUpd` +"""""""" +.. autoclass:: MixUpd + :members: + :special-members: __call__ + Signal (Dict) ^^^^^^^^^^^^^ @@ -2250,6 +2310,9 @@ Utilities .. automodule:: monai.transforms.utils_pytorch_numpy_unification :members: +.. automodule:: monai.transforms.utils_morphological_ops + :members: + By Categories ------------- .. toctree:: diff --git a/docs/source/transforms_idx.rst b/docs/source/transforms_idx.rst index f4d02a483f..650d45db71 100644 --- a/docs/source/transforms_idx.rst +++ b/docs/source/transforms_idx.rst @@ -74,6 +74,16 @@ Post-processing post.array post.dictionary +Regularization +^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _gen + :nosignatures: + + regularization.array + regularization.dictionary + Signal ^^^^^^ diff --git a/docs/source/utils.rst b/docs/source/utils.rst index 527247799f..fef671e1f8 100644 --- a/docs/source/utils.rst +++ b/docs/source/utils.rst @@ -81,3 +81,8 @@ Component store --------------- .. autoclass:: monai.utils.component_store.ComponentStore :members: + +Ordering +-------- +.. 
automodule:: monai.utils.ordering + :members: diff --git a/monai/__init__.py b/monai/__init__.py index 638220f6df..cb0ccd36f8 100644 --- a/monai/__init__.py +++ b/monai/__init__.py @@ -17,7 +17,7 @@ from ._version import get_versions PY_REQUIRED_MAJOR = 3 -PY_REQUIRED_MINOR = 8 +PY_REQUIRED_MINOR = 9 version_dict = get_versions() __version__: str = version_dict.get("version", "0+unknown") @@ -83,6 +83,11 @@ from .utils.tf32 import detect_default_tf32 detect_default_tf32() + import torch + + # workaround related to https://github.com/Project-MONAI/MONAI/issues/7575 + if hasattr(torch.cuda.device_count, "cache_clear"): + torch.cuda.device_count.cache_clear() except BaseException: from .utils.misc import MONAIEnvVars diff --git a/monai/apps/auto3dseg/auto_runner.py b/monai/apps/auto3dseg/auto_runner.py index 52a0824227..99bf92c481 100644 --- a/monai/apps/auto3dseg/auto_runner.py +++ b/monai/apps/auto3dseg/auto_runner.py @@ -298,9 +298,13 @@ def __init__( pass # inspect and update folds - num_fold = self.inspect_datalist_folds(datalist_filename=datalist_filename) + self.max_fold = self.inspect_datalist_folds(datalist_filename=datalist_filename) if "num_fold" in self.data_src_cfg: num_fold = int(self.data_src_cfg["num_fold"]) # override from config + logger.info(f"Setting num_fold {num_fold} based on the input config.") + else: + num_fold = self.max_fold + logger.info(f"Setting num_fold {num_fold} based on the input datalist {datalist_filename}.") self.data_src_cfg["datalist"] = datalist_filename # update path to a version in work_dir and save user input ConfigParser.export_config_file( @@ -398,7 +402,10 @@ def inspect_datalist_folds(self, datalist_filename: str) -> int: if len(fold_list) > 0: num_fold = max(fold_list) + 1 - logger.info(f"Setting num_fold {num_fold} based on the input datalist {datalist_filename}.") + logger.info(f"Found num_fold {num_fold} based on the input datalist {datalist_filename}.") + # check if every fold is present + if len(set(fold_list)) != 
num_fold: + raise ValueError(f"Fold numbers are not continuous from 0 to {num_fold - 1}") elif "validation" in datalist and len(datalist["validation"]) > 0: logger.info("No fold numbers provided, attempting to use a single fold based on the validation key") # update the datalist file @@ -492,6 +499,11 @@ def set_num_fold(self, num_fold: int = 5) -> AutoRunner: if num_fold <= 0: raise ValueError(f"num_fold is expected to be an integer greater than zero. Now it gets {num_fold}") + if num_fold > self.max_fold: + # Auto3DSeg must contain validation set, so the maximum fold number is max_fold. + raise ValueError( + f"num_fold is greater than the maximum fold number {self.max_fold} in {self.datalist_filename}." + ) self.num_fold = num_fold return self @@ -539,7 +551,7 @@ def set_device_info( cmd_prefix: command line prefix for subprocess running in BundleAlgo and EnsembleRunner. Default using env "CMD_PREFIX" or None, examples are: - - single GPU/CPU or multinode bcprun: "python " or "/opt/conda/bin/python3.8 ", + - single GPU/CPU or multinode bcprun: "python " or "/opt/conda/bin/python3.9 ", - single node multi-GPU running "torchrun --nnodes=1 --nproc_per_node=2 " If user define this prefix, please make sure --nproc_per_node matches cuda_visible_device or diff --git a/monai/apps/auto3dseg/hpo_gen.py b/monai/apps/auto3dseg/hpo_gen.py index 688bf2b916..ed6d903897 100644 --- a/monai/apps/auto3dseg/hpo_gen.py +++ b/monai/apps/auto3dseg/hpo_gen.py @@ -53,7 +53,7 @@ def update_params(self, *args, **kwargs): raise NotImplementedError @abstractmethod - def set_score(self): + def set_score(self, *args, **kwargs): """Report the evaluated results to HPO.""" raise NotImplementedError @@ -193,7 +193,9 @@ def generate(self, output_folder: str = ".") -> None: self.obj_filename = os.path.join(write_path, "algo_object.pkl") if isinstance(self.algo, BundleAlgo): - self.algo.export_to_disk(output_folder, task_prefix + task_id, fill_with_datastats=False) + self.algo.export_to_disk( + 
output_folder, task_prefix + task_id, bundle_root=write_path, fill_with_datastats=False + ) else: ConfigParser.export_config_file(self.params, write_path) logger.info(write_path) diff --git a/monai/apps/deepedit/transforms.py b/monai/apps/deepedit/transforms.py index 6d0825f54a..5af082e2b0 100644 --- a/monai/apps/deepedit/transforms.py +++ b/monai/apps/deepedit/transforms.py @@ -30,7 +30,7 @@ logger = logging.getLogger(__name__) -distance_transform_cdt, _ = optional_import("scipy.ndimage.morphology", name="distance_transform_cdt") +distance_transform_cdt, _ = optional_import("scipy.ndimage", name="distance_transform_cdt") class DiscardAddGuidanced(MapTransform): diff --git a/monai/apps/deepgrow/transforms.py b/monai/apps/deepgrow/transforms.py index 9aca77a36c..c2f97091fd 100644 --- a/monai/apps/deepgrow/transforms.py +++ b/monai/apps/deepgrow/transforms.py @@ -27,7 +27,7 @@ from monai.utils.enums import PostFix measure, _ = optional_import("skimage.measure", "0.14.2", min_version) -distance_transform_cdt, _ = optional_import("scipy.ndimage.morphology", name="distance_transform_cdt") +distance_transform_cdt, _ = optional_import("scipy.ndimage", name="distance_transform_cdt") DEFAULT_POST_FIX = PostFix.meta() diff --git a/monai/apps/detection/utils/anchor_utils.py b/monai/apps/detection/utils/anchor_utils.py index 283169b653..cbde3ebae9 100644 --- a/monai/apps/detection/utils/anchor_utils.py +++ b/monai/apps/detection/utils/anchor_utils.py @@ -189,7 +189,7 @@ def generate_anchors( w_ratios = 1 / area_scale h_ratios = area_scale # if 3d, w:h:d = 1:aspect_ratios[:,0]:aspect_ratios[:,1] - elif self.spatial_dims == 3: + else: area_scale = torch.pow(aspect_ratios_t[:, 0] * aspect_ratios_t[:, 1], 1 / 3.0) w_ratios = 1 / area_scale h_ratios = aspect_ratios_t[:, 0] / area_scale @@ -199,7 +199,7 @@ def generate_anchors( hs = (h_ratios[:, None] * scales_t[None, :]).view(-1) if self.spatial_dims == 2: base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2.0 - elif 
self.spatial_dims == 3: + else: # elif self.spatial_dims == 3: ds = (d_ratios[:, None] * scales_t[None, :]).view(-1) base_anchors = torch.stack([-ws, -hs, -ds, ws, hs, ds], dim=1) / 2.0 diff --git a/monai/apps/generation/__init__.py b/monai/apps/generation/__init__.py new file mode 100644 index 0000000000..1e97f89407 --- /dev/null +++ b/monai/apps/generation/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/monai/apps/generation/maisi/__init__.py b/monai/apps/generation/maisi/__init__.py new file mode 100644 index 0000000000..1e97f89407 --- /dev/null +++ b/monai/apps/generation/maisi/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/monai/apps/generation/maisi/networks/__init__.py b/monai/apps/generation/maisi/networks/__init__.py new file mode 100644 index 0000000000..1e97f89407 --- /dev/null +++ b/monai/apps/generation/maisi/networks/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/monai/apps/generation/maisi/networks/autoencoderkl_maisi.py b/monai/apps/generation/maisi/networks/autoencoderkl_maisi.py new file mode 100644 index 0000000000..a52274b24a --- /dev/null +++ b/monai/apps/generation/maisi/networks/autoencoderkl_maisi.py @@ -0,0 +1,991 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import gc +import logging +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from monai.networks.blocks import Convolution +from monai.networks.blocks.spatialattention import SpatialAttentionBlock +from monai.networks.nets.autoencoderkl import AEKLResBlock, AutoencoderKL +from monai.utils.type_conversion import convert_to_tensor + +# Set up logging configuration +logger = logging.getLogger(__name__) + + +def _empty_cuda_cache(save_mem: bool) -> None: + if torch.cuda.is_available() and save_mem: + torch.cuda.empty_cache() + return + + +class MaisiGroupNorm3D(nn.GroupNorm): + """ + Custom 3D Group Normalization with optional print_info output. + + Args: + num_groups: Number of groups for the group norm. + num_channels: Number of channels for the group norm. + eps: Epsilon value for numerical stability. + affine: Whether to use learnable affine parameters, default to `True`. + norm_float16: If True, convert output of MaisiGroupNorm3D to float16 format, default to `False`. + print_info: Whether to print information, default to `False`. + save_mem: Whether to clean CUDA cache in order to save GPU memory, default to `True`. 
+ """ + + def __init__( + self, + num_groups: int, + num_channels: int, + eps: float = 1e-5, + affine: bool = True, + norm_float16: bool = False, + print_info: bool = False, + save_mem: bool = True, + ): + super().__init__(num_groups, num_channels, eps, affine) + self.norm_float16 = norm_float16 + self.print_info = print_info + self.save_mem = save_mem + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self.print_info: + logger.info(f"MaisiGroupNorm3D with input size: {input.size()}") + + if len(input.shape) != 5: + raise ValueError("Expected a 5D tensor") + + param_n, param_c, param_d, param_h, param_w = input.shape + input = input.view(param_n, self.num_groups, param_c // self.num_groups, param_d, param_h, param_w) + + inputs = [] + for i in range(input.size(1)): + array = input[:, i : i + 1, ...].to(dtype=torch.float32) + mean = array.mean([2, 3, 4, 5], keepdim=True) + std = array.var([2, 3, 4, 5], unbiased=False, keepdim=True).add_(self.eps).sqrt_() + if self.norm_float16: + inputs.append(((array - mean) / std).to(dtype=torch.float16)) + else: + inputs.append((array - mean) / std) + + del input + _empty_cuda_cache(self.save_mem) + + input = torch.cat(inputs, dim=1) if max(inputs[0].size()) < 500 else self._cat_inputs(inputs) + + input = input.view(param_n, param_c, param_d, param_h, param_w) + if self.affine: + input.mul_(self.weight.view(1, param_c, 1, 1, 1)).add_(self.bias.view(1, param_c, 1, 1, 1)) + + if self.print_info: + logger.info(f"MaisiGroupNorm3D with output size: {input.size()}") + + return input + + def _cat_inputs(self, inputs): + input_type = inputs[0].device.type + input = inputs[0].clone().to("cpu", non_blocking=True) if input_type == "cuda" else inputs[0].clone() + inputs[0] = 0 + _empty_cuda_cache(self.save_mem) + + for k in range(len(inputs) - 1): + input = torch.cat((input, inputs[k + 1].cpu()), dim=1) + inputs[k + 1] = 0 + _empty_cuda_cache(self.save_mem) + gc.collect() + + if self.print_info: + logger.info(f"MaisiGroupNorm3D 
concat progress: {k + 1}/{len(inputs) - 1}.") + + return input.to("cuda", non_blocking=True) if input_type == "cuda" else input + + +class MaisiConvolution(nn.Module): + """ + Convolutional layer with optional print_info output and custom splitting mechanism. + + Args: + spatial_dims: Number of spatial dimensions (1D, 2D, 3D). + in_channels: Number of input channels. + out_channels: Number of output channels. + num_splits: Number of splits for the input tensor. + dim_split: Dimension of splitting for the input tensor. + print_info: Whether to print information. + save_mem: Whether to clean CUDA cache in order to save GPU memory, default to `True`. + Additional arguments for the convolution operation. + https://docs.monai.io/en/stable/networks.html#convolution + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + num_splits: int, + dim_split: int, + print_info: bool, + save_mem: bool = True, + strides: Sequence[int] | int = 1, + kernel_size: Sequence[int] | int = 3, + adn_ordering: str = "NDA", + act: tuple | str | None = "PRELU", + norm: tuple | str | None = "INSTANCE", + dropout: tuple | str | float | None = None, + dropout_dim: int = 1, + dilation: Sequence[int] | int = 1, + groups: int = 1, + bias: bool = True, + conv_only: bool = False, + is_transposed: bool = False, + padding: Sequence[int] | int | None = None, + output_padding: Sequence[int] | int | None = None, + ) -> None: + super().__init__() + self.conv = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=out_channels, + strides=strides, + kernel_size=kernel_size, + adn_ordering=adn_ordering, + act=act, + norm=norm, + dropout=dropout, + dropout_dim=dropout_dim, + dilation=dilation, + groups=groups, + bias=bias, + conv_only=conv_only, + is_transposed=is_transposed, + padding=padding, + output_padding=output_padding, + ) + + self.dim_split = dim_split + self.stride = strides[self.dim_split] if isinstance(strides, (list, tuple)) else strides  # per-axis or scalar +
self.num_splits = num_splits + self.print_info = print_info + self.save_mem = save_mem + + def _split_tensor(self, x: torch.Tensor, split_size: int, padding: int) -> list[torch.Tensor]: + overlaps = [0] + [padding] * (self.num_splits - 1) + last_padding = x.size(self.dim_split + 2) % split_size + + slices = [slice(None)] * 5 + splits: list[torch.Tensor] = [] + for i in range(self.num_splits): + slices[self.dim_split + 2] = slice( + i * split_size - overlaps[i], + (i + 1) * split_size + (padding if i != self.num_splits - 1 else last_padding), + ) + splits.append(x[tuple(slices)]) + + if self.print_info: + for j in range(len(splits)): + logger.info(f"Split {j + 1}/{len(splits)} size: {splits[j].size()}") + + return splits + + def _concatenate_tensors(self, outputs: list[torch.Tensor], split_size: int, padding: int) -> torch.Tensor: + slices = [slice(None)] * 5 + for i in range(self.num_splits): + slices[self.dim_split + 2] = slice(None, split_size) if i == 0 else slice(padding, padding + split_size) + outputs[i] = outputs[i][tuple(slices)] + + if self.print_info: + for i in range(self.num_splits): + logger.info(f"Output {i + 1}/{len(outputs)} size after: {outputs[i].size()}") + + if max(outputs[0].size()) < 500: + x = torch.cat(outputs, dim=self.dim_split + 2) + else: + out_device, x = outputs[0].device, outputs[0].clone().to("cpu", non_blocking=True) + outputs[0] = torch.Tensor(0) + _empty_cuda_cache(self.save_mem) + for k in range(len(outputs) - 1): + x = torch.cat((x, outputs[k + 1].cpu()), dim=self.dim_split + 2) + outputs[k + 1] = torch.Tensor(0) + _empty_cuda_cache(self.save_mem) + gc.collect() + if self.print_info: + logger.info(f"MaisiConvolution concat progress: {k + 1}/{len(outputs) - 1}.") + + x = x.to(out_device, non_blocking=True)  # back to the source device; CPU inputs stay on CPU + return x + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.print_info: + logger.info(f"Number of splits: {self.num_splits}") + + # compute size of splits + l = x.size(self.dim_split + 2) + split_size = l // self.num_splits + + # update padding
length if necessary + padding = 3 + if padding % self.stride > 0: + padding = (padding // self.stride + 1) * self.stride + if self.print_info: + logger.info(f"Padding size: {padding}") + + # split tensor into a list of tensors + splits = self._split_tensor(x, split_size, padding) + + del x + _empty_cuda_cache(self.save_mem) + + # convolution + outputs = [self.conv(split) for split in splits] + if self.print_info: + for j in range(len(outputs)): + logger.info(f"Output {j + 1}/{len(outputs)} size before: {outputs[j].size()}") + + # update size of splits and padding length for output + split_size_out = split_size + padding_s = padding + non_dim_split = self.dim_split + 1 if self.dim_split < 2 else 0 + if outputs[0].size(non_dim_split + 2) // splits[0].size(non_dim_split + 2) == 2: + split_size_out *= 2 + padding_s *= 2 + elif splits[0].size(non_dim_split + 2) // outputs[0].size(non_dim_split + 2) == 2: + split_size_out //= 2 + padding_s //= 2 + + # concatenate list of tensors + x = self._concatenate_tensors(outputs, split_size_out, padding_s) + + del outputs + _empty_cuda_cache(self.save_mem) + + return x + + +class MaisiUpsample(nn.Module): + """ + Convolution-based upsampling layer. + + Args: + spatial_dims: Number of spatial dimensions (1D, 2D, 3D). + in_channels: Number of input channels to the layer. + use_convtranspose: If True, use ConvTranspose to upsample feature maps in decoder. + num_splits: Number of splits for the input tensor. + dim_split: Dimension of splitting for the input tensor. + print_info: Whether to print information. + save_mem: Whether to clean CUDA cache in order to save GPU memory, default to `True`. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + use_convtranspose: bool, + num_splits: int, + dim_split: int, + print_info: bool, + save_mem: bool = True, + ) -> None: + super().__init__() + self.conv = MaisiConvolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=in_channels, + strides=2 if use_convtranspose else 1, + kernel_size=3, + padding=1, + conv_only=True, + is_transposed=use_convtranspose, + num_splits=num_splits, + dim_split=dim_split, + print_info=print_info, + save_mem=save_mem, + ) + self.use_convtranspose = use_convtranspose + self.save_mem = save_mem + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.use_convtranspose: + x = self.conv(x) + x_tensor: torch.Tensor = convert_to_tensor(x) + return x_tensor + + x = F.interpolate(x, scale_factor=2.0, mode="trilinear") + _empty_cuda_cache(self.save_mem) + x = self.conv(x) + _empty_cuda_cache(self.save_mem) + + out_tensor: torch.Tensor = convert_to_tensor(x) + return out_tensor + + +class MaisiDownsample(nn.Module): + """ + Convolution-based downsampling layer. + + Args: + spatial_dims: Number of spatial dimensions (1D, 2D, 3D). + in_channels: Number of input channels. + num_splits: Number of splits for the input tensor. + dim_split: Dimension of splitting for the input tensor. + print_info: Whether to print information. + save_mem: Whether to clean CUDA cache in order to save GPU memory, default to `True`. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + num_splits: int, + dim_split: int, + print_info: bool, + save_mem: bool = True, + ) -> None: + super().__init__() + self.pad = (0, 1) * spatial_dims + self.conv = MaisiConvolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=in_channels, + strides=2, + kernel_size=3, + padding=0, + conv_only=True, + num_splits=num_splits, + dim_split=dim_split, + print_info=print_info, + save_mem=save_mem, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.pad(x, self.pad, mode="constant", value=0.0) + x = self.conv(x) + return x + + +class MaisiResBlock(nn.Module): + """ + Residual block consisting of a cascade of 2 convolutions + activation + normalisation block, and a + residual connection between input and output. + + Args: + spatial_dims: Number of spatial dimensions (1D, 2D, 3D). + in_channels: Input channels to the layer. + norm_num_groups: Number of groups for the group norm layer. + norm_eps: Epsilon for the normalization. + out_channels: Number of output channels. + num_splits: Number of splits for the input tensor. + dim_split: Dimension of splitting for the input tensor. + norm_float16: If True, convert output of MaisiGroupNorm3D to float16 format, default to `False`. + print_info: Whether to print information, default to `False`. + save_mem: Whether to clean CUDA cache in order to save GPU memory, default to `True`. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + norm_num_groups: int, + norm_eps: float, + out_channels: int, + num_splits: int, + dim_split: int, + norm_float16: bool = False, + print_info: bool = False, + save_mem: bool = True, + ) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = in_channels if out_channels is None else out_channels + self.save_mem = save_mem + + self.norm1 = MaisiGroupNorm3D( + num_groups=norm_num_groups, + num_channels=in_channels, + eps=norm_eps, + affine=True, + norm_float16=norm_float16, + print_info=print_info, + save_mem=save_mem, + ) + self.conv1 = MaisiConvolution( + spatial_dims=spatial_dims, + in_channels=self.in_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + num_splits=num_splits, + dim_split=dim_split, + print_info=print_info, + save_mem=save_mem, + ) + self.norm2 = MaisiGroupNorm3D( + num_groups=norm_num_groups, + num_channels=self.out_channels,  # resolved width, so out_channels=None still works + eps=norm_eps, + affine=True, + norm_float16=norm_float16, + print_info=print_info, + save_mem=save_mem, + ) + self.conv2 = MaisiConvolution( + spatial_dims=spatial_dims, + in_channels=self.out_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + num_splits=num_splits, + dim_split=dim_split, + print_info=print_info, + save_mem=save_mem, + ) + + self.nin_shortcut = ( + MaisiConvolution( + spatial_dims=spatial_dims, + in_channels=self.in_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + num_splits=num_splits, + dim_split=dim_split, + print_info=print_info, + save_mem=save_mem, + ) + if self.in_channels != self.out_channels + else nn.Identity() + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + h = self.norm1(x) + _empty_cuda_cache(self.save_mem) + + h = F.silu(h) + _empty_cuda_cache(self.save_mem) + h = self.conv1(h) + _empty_cuda_cache(self.save_mem) + + h
= self.norm2(h) + _empty_cuda_cache(self.save_mem) + + h = F.silu(h) + _empty_cuda_cache(self.save_mem) + h = self.conv2(h) + _empty_cuda_cache(self.save_mem) + + if self.in_channels != self.out_channels: + x = self.nin_shortcut(x) + _empty_cuda_cache(self.save_mem) + + out = x + h + out_tensor: torch.Tensor = convert_to_tensor(out) + return out_tensor + + +class MaisiEncoder(nn.Module): + """ + Convolutional cascade that downsamples the image into a spatial latent space. + + Args: + spatial_dims: Number of spatial dimensions (1D, 2D, 3D). + in_channels: Number of input channels. + num_channels: Sequence of block output channels. + out_channels: Number of channels in the bottom layer (latent space) of the autoencoder. + num_res_blocks: Number of residual blocks (see AEKLResBlock) per level. + norm_num_groups: Number of groups for the group norm layers. + norm_eps: Epsilon for the normalization. + attention_levels: Indicate which level from num_channels contain an attention block. + with_nonlocal_attn: If True, use non-local attention block. + include_fc: whether to include the final linear layer in the attention block. Default to False. + use_combined_linear: whether to use a single linear layer for qkv projection in the attention block, default to False. + use_flash_attention: If True, use flash attention for a memory efficient attention mechanism. + num_splits: Number of splits for the input tensor. + dim_split: Dimension of splitting for the input tensor. + norm_float16: If True, convert output of MaisiGroupNorm3D to float16 format, default to `False`. + print_info: Whether to print information, default to `False`. + save_mem: Whether to clean CUDA cache in order to save GPU memory, default to `True`. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + num_channels: Sequence[int], + out_channels: int, + num_res_blocks: Sequence[int], + norm_num_groups: int, + norm_eps: float, + attention_levels: Sequence[bool], + num_splits: int, + dim_split: int, + norm_float16: bool = False, + print_info: bool = False, + save_mem: bool = True, + with_nonlocal_attn: bool = True, + include_fc: bool = False, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + + # Check if attention_levels and num_channels have the same size + if len(attention_levels) != len(num_channels): + raise ValueError("attention_levels and num_channels must have the same size") + + # Check if num_res_blocks and num_channels have the same size + if len(num_res_blocks) != len(num_channels): + raise ValueError("num_res_blocks and num_channels must have the same size") + + self.save_mem = save_mem + + blocks: list[nn.Module] = [] + + blocks.append( + MaisiConvolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=num_channels[0], + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + num_splits=num_splits, + dim_split=dim_split, + print_info=print_info, + save_mem=save_mem, + ) + ) + + output_channel = num_channels[0] + for i in range(len(num_channels)): + input_channel = output_channel + output_channel = num_channels[i] + is_final_block = i == len(num_channels) - 1 + + for _ in range(num_res_blocks[i]): + blocks.append( + MaisiResBlock( + spatial_dims=spatial_dims, + in_channels=input_channel, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=output_channel, + num_splits=num_splits, + dim_split=dim_split, + norm_float16=norm_float16, + print_info=print_info, + save_mem=save_mem, + ) + ) + input_channel = output_channel + if attention_levels[i]: + blocks.append( + SpatialAttentionBlock( + spatial_dims=spatial_dims, + num_channels=input_channel, + norm_num_groups=norm_num_groups, 
class MaisiDecoder(nn.Module):
    """
    Convolutional cascade upsampling from a spatial latent space into an image space.

    Layers are stored in ``self.blocks`` and applied in order: an input ``MaisiConvolution``;
    an optional non-local stage (``AEKLResBlock``, ``SpatialAttentionBlock``, ``AEKLResBlock``);
    then, walking ``num_channels`` in reverse, the ``MaisiResBlock`` stack of each level with a
    ``MaisiUpsample`` after every level except the last; finally a ``MaisiGroupNorm3D`` and the
    output ``MaisiConvolution``.

    Args:
        spatial_dims: Number of spatial dimensions (1D, 2D, 3D).
        num_channels: Sequence of block output channels (encoder order; reversed internally).
        in_channels: Number of channels in the bottom layer (latent space) of the autoencoder.
        out_channels: Number of output channels.
        num_res_blocks: Number of residual blocks (``MaisiResBlock``) per level.
        norm_num_groups: Number of groups for the group norm layers.
        norm_eps: Epsilon for the normalization.
        attention_levels: Indicate which level from num_channels contain an attention block.
        num_splits: Number of splits for the input tensor.
        dim_split: Dimension of splitting for the input tensor.
        norm_float16: If True, convert output of MaisiGroupNorm3D to float16 format, default to `False`.
        print_info: Whether to print information, default to `False`.
        save_mem: Whether to clean CUDA cache in order to save GPU memory, default to `True`.
        with_nonlocal_attn: If True, use non-local attention block.
        include_fc: whether to include the final linear layer in the attention block. Default to False.
        use_combined_linear: whether to use a single linear layer for qkv projection in the attention block,
            default to False.
        use_flash_attention: If True, use flash attention for a memory efficient attention mechanism.
        use_convtranspose: If True, use ConvTranspose to upsample feature maps in decoder.

    Raises:
        ValueError: if ``attention_levels`` or ``num_res_blocks`` does not have the same length
            as ``num_channels``.
    """

    def __init__(
        self,
        spatial_dims: int,
        num_channels: Sequence[int],
        in_channels: int,
        out_channels: int,
        num_res_blocks: Sequence[int],
        norm_num_groups: int,
        norm_eps: float,
        attention_levels: Sequence[bool],
        num_splits: int,
        dim_split: int,
        norm_float16: bool = False,
        print_info: bool = False,
        save_mem: bool = True,
        with_nonlocal_attn: bool = True,
        include_fc: bool = False,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
        use_convtranspose: bool = False,
    ) -> None:
        super().__init__()

        # Same up-front checks MaisiEncoder performs. Without them a length mismatch is either
        # silently ignored (extra entries are dropped by the reversed indexing below) or shows
        # up later as an IndexError that does not name the offending argument.
        if len(attention_levels) != len(num_channels):
            raise ValueError("attention_levels and num_channels must have the same size")
        if len(num_res_blocks) != len(num_channels):
            raise ValueError("num_res_blocks and num_channels must have the same size")

        self.print_info = print_info
        self.save_mem = save_mem

        # The decoder walks the levels from the deepest (last encoder level) to the shallowest.
        reversed_block_out_channels = list(reversed(num_channels))

        blocks: list[nn.Module] = []

        # Lift the latent tensor to the width of the deepest level.
        blocks.append(
            MaisiConvolution(
                spatial_dims=spatial_dims,
                in_channels=in_channels,
                out_channels=reversed_block_out_channels[0],
                strides=1,
                kernel_size=3,
                padding=1,
                conv_only=True,
                num_splits=num_splits,
                dim_split=dim_split,
                print_info=print_info,
                save_mem=save_mem,
            )
        )

        # Optional non-local stage at the deepest resolution: res block, attention, res block.
        if with_nonlocal_attn:
            blocks.append(
                AEKLResBlock(
                    spatial_dims=spatial_dims,
                    in_channels=reversed_block_out_channels[0],
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                    out_channels=reversed_block_out_channels[0],
                )
            )
            blocks.append(
                SpatialAttentionBlock(
                    spatial_dims=spatial_dims,
                    num_channels=reversed_block_out_channels[0],
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                    include_fc=include_fc,
                    use_combined_linear=use_combined_linear,
                    use_flash_attention=use_flash_attention,
                )
            )
            blocks.append(
                AEKLResBlock(
                    spatial_dims=spatial_dims,
                    in_channels=reversed_block_out_channels[0],
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                    out_channels=reversed_block_out_channels[0],
                )
            )

        reversed_attention_levels = list(reversed(attention_levels))
        reversed_num_res_blocks = list(reversed(num_res_blocks))
        block_out_ch = reversed_block_out_channels[0]
        for i in range(len(reversed_block_out_channels)):
            block_in_ch = block_out_ch
            block_out_ch = reversed_block_out_channels[i]
            is_final_block = i == len(num_channels) - 1

            for _ in range(reversed_num_res_blocks[i]):
                blocks.append(
                    MaisiResBlock(
                        spatial_dims=spatial_dims,
                        in_channels=block_in_ch,
                        norm_num_groups=norm_num_groups,
                        norm_eps=norm_eps,
                        out_channels=block_out_ch,
                        num_splits=num_splits,
                        dim_split=dim_split,
                        norm_float16=norm_float16,
                        print_info=print_info,
                        save_mem=save_mem,
                    )
                )
                # Only the first res block of a level changes the width.
                block_in_ch = block_out_ch

                # NOTE(review): nesting reconstructed from a whitespace-collapsed source; the
                # attention block is taken to follow each res block of a flagged level, as in
                # the encoder loop - confirm against the upstream file.
                if reversed_attention_levels[i]:
                    blocks.append(
                        SpatialAttentionBlock(
                            spatial_dims=spatial_dims,
                            num_channels=block_in_ch,
                            norm_num_groups=norm_num_groups,
                            norm_eps=norm_eps,
                            include_fc=include_fc,
                            use_combined_linear=use_combined_linear,
                            use_flash_attention=use_flash_attention,
                        )
                    )

            # No upsampling after the last (shallowest) level.
            if not is_final_block:
                blocks.append(
                    MaisiUpsample(
                        spatial_dims=spatial_dims,
                        in_channels=block_in_ch,
                        use_convtranspose=use_convtranspose,
                        num_splits=num_splits,
                        dim_split=dim_split,
                        print_info=print_info,
                        save_mem=save_mem,
                    )
                )

        # Output head: normalise, then project to the requested number of image channels.
        blocks.append(
            MaisiGroupNorm3D(
                num_groups=norm_num_groups,
                num_channels=block_in_ch,
                eps=norm_eps,
                affine=True,
                norm_float16=norm_float16,
                print_info=print_info,
                save_mem=save_mem,
            )
        )
        blocks.append(
            MaisiConvolution(
                spatial_dims=spatial_dims,
                in_channels=block_in_ch,
                out_channels=out_channels,
                strides=1,
                kernel_size=3,
                padding=1,
                conv_only=True,
                num_splits=num_splits,
                dim_split=dim_split,
                print_info=print_info,
                save_mem=save_mem,
            )
        )

        self.blocks = nn.ModuleList(blocks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply every stored block in order, calling ``_empty_cuda_cache(self.save_mem)`` after each."""
        for block in self.blocks:
            x = block(x)
            _empty_cuda_cache(self.save_mem)
        return x
+ in_channels: Number of input channels. + out_channels: Number of output channels. + num_res_blocks: Number of residual blocks per level. + num_channels: Sequence of block output channels. + attention_levels: Indicate which level from num_channels contain an attention block. + latent_channels: Number of channels in the latent space. + norm_num_groups: Number of groups for the group norm layers. + norm_eps: Epsilon for the normalization. + with_encoder_nonlocal_attn: If True, use non-local attention block in the encoder. + with_decoder_nonlocal_attn: If True, use non-local attention block in the decoder. + include_fc: whether to include the final linear layer. Default to False. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: If True, use flash attention for a memory efficient attention mechanism. + use_checkpointing: If True, use activation checkpointing. + use_convtranspose: If True, use ConvTranspose to upsample feature maps in decoder. + num_splits: Number of splits for the input tensor. + dim_split: Dimension of splitting for the input tensor. + norm_float16: If True, convert output of MaisiGroupNorm3D to float16 format, default to `False`. + print_info: Whether to print information, default to `False`. + save_mem: Whether to clean CUDA cache in order to save GPU memory, default to `True`. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + num_res_blocks: Sequence[int], + num_channels: Sequence[int], + attention_levels: Sequence[bool], + latent_channels: int = 3, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + with_encoder_nonlocal_attn: bool = False, + with_decoder_nonlocal_attn: bool = False, + include_fc: bool = False, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + use_checkpointing: bool = False, + use_convtranspose: bool = False, + num_splits: int = 16, + dim_split: int = 0, + norm_float16: bool = False, + print_info: bool = False, + save_mem: bool = True, + ) -> None: + super().__init__( + spatial_dims, + in_channels, + out_channels, + num_res_blocks, + num_channels, + attention_levels, + latent_channels, + norm_num_groups, + norm_eps, + with_encoder_nonlocal_attn, + with_decoder_nonlocal_attn, + use_checkpointing, + use_convtranspose, + include_fc, + use_combined_linear, + use_flash_attention, + ) + + self.encoder: nn.Module = MaisiEncoder( + spatial_dims=spatial_dims, + in_channels=in_channels, + num_channels=num_channels, + out_channels=latent_channels, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + attention_levels=attention_levels, + with_nonlocal_attn=with_encoder_nonlocal_attn, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + num_splits=num_splits, + dim_split=dim_split, + norm_float16=norm_float16, + print_info=print_info, + save_mem=save_mem, + ) + + self.decoder: nn.Module = MaisiDecoder( + spatial_dims=spatial_dims, + num_channels=num_channels, + in_channels=latent_channels, + out_channels=out_channels, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + attention_levels=attention_levels, + with_nonlocal_attn=with_decoder_nonlocal_attn, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + 
class ControlNetMaisi(ControlNet):
    """
    Control network for diffusion models based on Zhang and Agrawala "Adding Conditional Control to
    Text-to-Image Diffusion Models" (https://arxiv.org/abs/2302.05543)

    The constructor forwards its arguments positionally to ``ControlNet`` and only adds the
    ``use_checkpointing`` switch, which ``forward`` consults when embedding the control input.

    Args:
        spatial_dims: number of spatial dimensions.
        in_channels: number of input channels.
        num_res_blocks: number of residual blocks (see ResnetBlock) per level.
        num_channels: tuple of block output channels.
        attention_levels: list of levels to add attention.
        norm_num_groups: number of groups for the normalization.
        norm_eps: epsilon for the normalization.
        resblock_updown: if True use residual blocks for up/downsampling.
        num_head_channels: number of channels in each attention head.
        with_conditioning: if True add spatial transformers to perform conditioning.
        transformer_num_layers: number of layers of Transformer blocks to use.
        cross_attention_dim: number of context dimensions to use.
        num_class_embeds: if specified (as an int), then this model will be class-conditional with
            `num_class_embeds` classes.
        upcast_attention: if True, upcast attention operations to full precision.
        conditioning_embedding_in_channels: number of input channels for the conditioning embedding.
        conditioning_embedding_num_channels: number of channels for the blocks in the conditioning embedding.
        use_checkpointing: if True, run the conditioning embedding through
            ``torch.utils.checkpoint.checkpoint`` to save memory.
        include_fc: whether to include the final linear layer. Default to False.
        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
        use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        num_res_blocks: Sequence[int] | int = (2, 2, 2, 2),
        num_channels: Sequence[int] = (32, 64, 64, 64),
        attention_levels: Sequence[bool] = (False, False, True, True),
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        resblock_updown: bool = False,
        num_head_channels: int | Sequence[int] = 8,
        with_conditioning: bool = False,
        transformer_num_layers: int = 1,
        cross_attention_dim: int | None = None,
        num_class_embeds: int | None = None,
        upcast_attention: bool = False,
        conditioning_embedding_in_channels: int = 1,
        conditioning_embedding_num_channels: Sequence[int] = (16, 32, 96, 256),
        use_checkpointing: bool = True,
        include_fc: bool = False,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
    ) -> None:
        # Positional order is significant here: it has to match the parent signature.
        super().__init__(
            spatial_dims,
            in_channels,
            num_res_blocks,
            num_channels,
            attention_levels,
            norm_num_groups,
            norm_eps,
            resblock_updown,
            num_head_channels,
            with_conditioning,
            transformer_num_layers,
            cross_attention_dim,
            num_class_embeds,
            upcast_attention,
            conditioning_embedding_in_channels,
            conditioning_embedding_num_channels,
            include_fc,
            use_combined_linear,
            use_flash_attention,
        )
        self.use_checkpointing = use_checkpointing

    def forward(
        self,
        x: torch.Tensor,
        timesteps: torch.Tensor,
        controlnet_cond: torch.Tensor,
        conditioning_scale: float = 1.0,
        context: torch.Tensor | None = None,
        class_labels: torch.Tensor | None = None,
    ) -> tuple[list[torch.Tensor], torch.Tensor]:
        """
        Compute the residuals this control network contributes.

        Args:
            x: network input; its dtype is the one embeddings are cast to.
            timesteps: passed to ``get_timestep_embedding``.
            controlnet_cond: control input, fed to ``self.controlnet_cond_embedding``.
            conditioning_scale: multiplier applied to every returned residual.
            context: optional conditioning; only accepted when ``with_conditioning`` is set.
            class_labels: required when ``num_class_embeds`` is not ``None``.

        Returns:
            ``(down_block_res_samples, mid_block_res_sample)``, both scaled by ``conditioning_scale``.
        """
        emb = self._prepare_time_and_class_embedding(x, timesteps, class_labels)
        h = self._apply_initial_convolution(x)

        embed_cond = self.controlnet_cond_embedding
        if self.use_checkpointing:
            cond_features = torch.utils.checkpoint.checkpoint(embed_cond, controlnet_cond, use_reentrant=False)
        else:
            cond_features = embed_cond(controlnet_cond)
        # In-place accumulation onto the initial features.
        h += cond_features

        skip_samples, h = self._apply_down_blocks(emb, context, h)
        h = self._apply_mid_block(emb, context, h)
        skip_samples, mid_sample = self._apply_controlnet_blocks(h, skip_samples)

        # scaling: new tensors for the down residuals, in-place for the mid residual
        scaled_skips = [sample * conditioning_scale for sample in skip_samples]
        mid_sample *= conditioning_scale

        return scaled_skips, mid_sample

    def _prepare_time_and_class_embedding(self, x, timesteps, class_labels):
        """Return the timestep embedding, plus the class embedding when the model is class-conditional."""
        # 1. time
        # get_timestep_embedding has no weights and always returns f32 tensors, while
        # time_embed might be running in fp16, hence the cast to the dtype of ``x``.
        t_emb = get_timestep_embedding(timesteps, self.block_out_channels[0]).to(dtype=x.dtype)
        emb = self.time_embed(t_emb)

        # 2. class
        if self.num_class_embeds is None:
            return emb
        if class_labels is None:
            raise ValueError("class_labels should be provided when num_class_embeds > 0")
        class_emb = self.class_embedding(class_labels).to(dtype=x.dtype)
        return emb + class_emb

    def _apply_initial_convolution(self, x):
        """Apply ``self.conv_in`` to the input."""
        # 3. initial convolution
        return self.conv_in(x)

    def _apply_down_blocks(self, emb, context, h):
        """Run every down block, collecting ``h`` and each block's residual outputs."""
        # 4. down
        if self.with_conditioning is False and context is not None:
            raise ValueError("model should have with_conditioning = True if context is provided")
        collected: list[torch.Tensor] = [h]
        for block in self.down_blocks:
            h, block_residuals = block(hidden_states=h, temb=emb, context=context)
            collected.extend(block_residuals)

        return collected, h

    def _apply_mid_block(self, emb, context, h):
        """Run the middle block."""
        # 5. mid
        return self.middle_block(hidden_states=h, temb=emb, context=context)

    def _apply_controlnet_blocks(self, h, down_block_res_samples):
        """Map each collected residual, and the mid features, through its controlnet block."""
        # 6. Control net blocks (zip stops at the shorter of the two sequences)
        projected = [
            block(sample) for sample, block in zip(down_block_res_samples, self.controlnet_down_blocks)
        ]
        mid_block_res_sample = self.controlnet_mid_block(h)

        return projected, mid_block_res_sample
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ========================================================================= +# Adapted from https://github.com/huggingface/diffusers +# which has the following license: +# https://github.com/huggingface/diffusers/blob/main/LICENSE +# +# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= + +from __future__ import annotations + +from collections.abc import Sequence + +import torch +from torch import nn + +from monai.networks.blocks import Convolution +from monai.networks.nets.diffusion_model_unet import ( + get_down_block, + get_mid_block, + get_timestep_embedding, + get_up_block, + zero_module, +) +from monai.utils import ensure_tuple_rep +from monai.utils.type_conversion import convert_to_tensor + +__all__ = ["DiffusionModelUNetMaisi"] + + +class DiffusionModelUNetMaisi(nn.Module): + """ + U-Net network with timestep embedding and attention mechanisms for conditioning based on + Rombach et al. 
"High-Resolution Image Synthesis with Latent Diffusion Models" https://arxiv.org/abs/2112.10752 + and Pinaya et al. "Brain Imaging Generation with Latent Diffusion Models" https://arxiv.org/abs/2209.07162 + + Args: + spatial_dims: Number of spatial dimensions. + in_channels: Number of input channels. + out_channels: Number of output channels. + num_res_blocks: Number of residual blocks (see ResnetBlock) per level. Can be a single integer or a sequence of integers. + num_channels: Tuple of block output channels. + attention_levels: List of levels to add attention. + norm_num_groups: Number of groups for the normalization. + norm_eps: Epsilon for the normalization. + resblock_updown: If True, use residual blocks for up/downsampling. + num_head_channels: Number of channels in each attention head. Can be a single integer or a sequence of integers. + with_conditioning: If True, add spatial transformers to perform conditioning. + transformer_num_layers: Number of layers of Transformer blocks to use. + cross_attention_dim: Number of context dimensions to use. + num_class_embeds: If specified (as an int), then this model will be class-conditional with `num_class_embeds` classes. + upcast_attention: If True, upcast attention operations to full precision. + include_fc: whether to include the final linear layer. Default to False. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: If True, use flash attention for a memory efficient attention mechanism. + dropout_cattn: If different from zero, this will be the dropout value for the cross-attention layers. + include_top_region_index_input: If True, use top region index input. + include_bottom_region_index_input: If True, use bottom region index input. + include_spacing_input: If True, use spacing input. 
def __init__(
    self,
    spatial_dims: int,
    in_channels: int,
    out_channels: int,
    num_res_blocks: Sequence[int] | int = (2, 2, 2, 2),
    num_channels: Sequence[int] = (32, 64, 64, 64),
    attention_levels: Sequence[bool] = (False, False, True, True),
    norm_num_groups: int = 32,
    norm_eps: float = 1e-6,
    resblock_updown: bool = False,
    num_head_channels: int | Sequence[int] = 8,
    with_conditioning: bool = False,
    transformer_num_layers: int = 1,
    cross_attention_dim: int | None = None,
    num_class_embeds: int | None = None,
    upcast_attention: bool = False,
    include_fc: bool = False,
    use_combined_linear: bool = False,
    use_flash_attention: bool = False,
    dropout_cattn: float = 0.0,
    include_top_region_index_input: bool = False,
    include_bottom_region_index_input: bool = False,
    include_spacing_input: bool = False,
) -> None:
    """
    Validate the configuration, then build the input conv, the embedding layers, the
    down / middle / up blocks and the output head, in that order.

    Raises:
        ValueError: if ``with_conditioning`` and ``cross_attention_dim`` are inconsistent, if
            ``dropout_cattn`` is outside ``[0, 1]``, if any entry of ``num_channels`` is not a
            multiple of ``norm_num_groups``, if ``attention_levels``, ``num_head_channels`` or
            ``num_res_blocks`` does not match ``num_channels`` in length, or if flash attention
            is requested while ``torch.cuda.is_available()`` is False.
    """
    super().__init__()
    # --- argument validation (the order decides which error a bad config reports first) ---
    if with_conditioning is True and cross_attention_dim is None:
        raise ValueError(
            "DiffusionModelUNetMaisi expects dimension of the cross-attention conditioning (cross_attention_dim) "
            "when using with_conditioning."
        )
    if cross_attention_dim is not None and with_conditioning is False:
        raise ValueError(
            "DiffusionModelUNetMaisi expects with_conditioning=True when specifying the cross_attention_dim."
        )
    if dropout_cattn > 1.0 or dropout_cattn < 0.0:
        raise ValueError("Dropout cannot be negative or >1.0!")

    # Every channel count must be a multiple of norm_num_groups
    if any((out_channel % norm_num_groups) != 0 for out_channel in num_channels):
        raise ValueError(
            f"DiffusionModelUNetMaisi expects all num_channels being multiple of norm_num_groups, "
            f"but get num_channels: {num_channels} and norm_num_groups: {norm_num_groups}"
        )

    if len(num_channels) != len(attention_levels):
        raise ValueError(
            f"DiffusionModelUNetMaisi expects num_channels being same size of attention_levels, "
            f"but get num_channels: {len(num_channels)} and attention_levels: {len(attention_levels)}"
        )

    # A single int is repeated once per level before the length check below.
    if isinstance(num_head_channels, int):
        num_head_channels = ensure_tuple_rep(num_head_channels, len(attention_levels))

    if len(num_head_channels) != len(attention_levels):
        raise ValueError(
            "num_head_channels should have the same length as attention_levels. For the i levels without attention,"
            " i.e. `attention_level[i]=False`, the num_head_channels[i] will be ignored."
        )

    # Same normalisation for the number of residual blocks per level.
    if isinstance(num_res_blocks, int):
        num_res_blocks = ensure_tuple_rep(num_res_blocks, len(num_channels))

    if len(num_res_blocks) != len(num_channels):
        raise ValueError(
            "`num_res_blocks` should be a single integer or a tuple of integers with the same length as "
            "`num_channels`."
        )

    if use_flash_attention is True and not torch.cuda.is_available():
        raise ValueError(
            "torch.cuda.is_available() should be True but is False. Flash attention is only available for GPU."
        )

    # Stored after normalisation, so num_res_blocks / num_head_channels are per-level sequences here.
    self.in_channels = in_channels
    self.block_out_channels = num_channels
    self.out_channels = out_channels
    self.num_res_blocks = num_res_blocks
    self.attention_levels = attention_levels
    self.num_head_channels = num_head_channels
    self.with_conditioning = with_conditioning

    # input
    self.conv_in = Convolution(
        spatial_dims=spatial_dims,
        in_channels=in_channels,
        out_channels=num_channels[0],
        strides=1,
        kernel_size=3,
        padding=1,
        conv_only=True,
    )

    # time: the embedding width is four times the first level's channel count
    time_embed_dim = num_channels[0] * 4
    self.time_embed = self._create_embedding_module(num_channels[0], time_embed_dim)

    # class embedding (the layer only exists when num_class_embeds is given)
    self.num_class_embeds = num_class_embeds
    if num_class_embeds is not None:
        self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)

    self.include_top_region_index_input = include_top_region_index_input
    self.include_bottom_region_index_input = include_bottom_region_index_input
    self.include_spacing_input = include_spacing_input

    # Each enabled extra input gets its own embedding layer and widens the embedding that the
    # down / middle / up blocks are built for by another time_embed_dim. The region-index
    # layers take 4 input features, the spacing layer 3.
    new_time_embed_dim = time_embed_dim
    if self.include_top_region_index_input:
        self.top_region_index_layer = self._create_embedding_module(4, time_embed_dim)
        new_time_embed_dim += time_embed_dim
    if self.include_bottom_region_index_input:
        self.bottom_region_index_layer = self._create_embedding_module(4, time_embed_dim)
        new_time_embed_dim += time_embed_dim
    if self.include_spacing_input:
        self.spacing_layer = self._create_embedding_module(3, time_embed_dim)
        new_time_embed_dim += time_embed_dim

    # down
    self.down_blocks = nn.ModuleList([])
    output_channel = num_channels[0]
    for i in range(len(num_channels)):
        # Each level consumes the previous level's output width.
        input_channel = output_channel
        output_channel = num_channels[i]
        is_final_block = i == len(num_channels) - 1
        down_block = get_down_block(
            spatial_dims=spatial_dims,
            in_channels=input_channel,
            out_channels=output_channel,
            temb_channels=new_time_embed_dim,
            num_res_blocks=num_res_blocks[i],
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            add_downsample=not is_final_block,
            resblock_updown=resblock_updown,
            # Attention levels use cross-attention when conditioning is on, self-attention otherwise.
            with_attn=(attention_levels[i] and not with_conditioning),
            with_cross_attn=(attention_levels[i] and with_conditioning),
            num_head_channels=num_head_channels[i],
            transformer_num_layers=transformer_num_layers,
            cross_attention_dim=cross_attention_dim,
            upcast_attention=upcast_attention,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
            dropout_cattn=dropout_cattn,
        )

        self.down_blocks.append(down_block)

    # mid
    self.middle_block = get_mid_block(
        spatial_dims=spatial_dims,
        in_channels=num_channels[-1],
        temb_channels=new_time_embed_dim,
        norm_num_groups=norm_num_groups,
        norm_eps=norm_eps,
        with_conditioning=with_conditioning,
        num_head_channels=num_head_channels[-1],
        transformer_num_layers=transformer_num_layers,
        cross_attention_dim=cross_attention_dim,
        upcast_attention=upcast_attention,
        include_fc=include_fc,
        use_combined_linear=use_combined_linear,
        use_flash_attention=use_flash_attention,
        dropout_cattn=dropout_cattn,
    )

    # up: mirror of the down path, so every per-level setting is walked in reverse
    self.up_blocks = nn.ModuleList([])
    reversed_block_out_channels = list(reversed(num_channels))
    reversed_num_res_blocks = list(reversed(num_res_blocks))
    reversed_attention_levels = list(reversed(attention_levels))
    reversed_num_head_channels = list(reversed(num_head_channels))
    output_channel = reversed_block_out_channels[0]
    for i in range(len(reversed_block_out_channels)):
        prev_output_channel = output_channel
        output_channel = reversed_block_out_channels[i]
        # Width of the next shallower level, clamped so the last level refers to itself.
        input_channel = reversed_block_out_channels[min(i + 1, len(num_channels) - 1)]

        is_final_block = i == len(num_channels) - 1

        up_block = get_up_block(
            spatial_dims=spatial_dims,
            in_channels=input_channel,
            prev_output_channel=prev_output_channel,
            out_channels=output_channel,
            temb_channels=new_time_embed_dim,
            # One more res block than the mirrored down level.
            # NOTE(review): presumably to consume the extra skip tensor per level - confirm in get_up_block.
            num_res_blocks=reversed_num_res_blocks[i] + 1,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            add_upsample=not is_final_block,
            resblock_updown=resblock_updown,
            with_attn=(reversed_attention_levels[i] and not with_conditioning),
            with_cross_attn=(reversed_attention_levels[i] and with_conditioning),
            num_head_channels=reversed_num_head_channels[i],
            transformer_num_layers=transformer_num_layers,
            cross_attention_dim=cross_attention_dim,
            upcast_attention=upcast_attention,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
            dropout_cattn=dropout_cattn,
        )

        self.up_blocks.append(up_block)

    # out: group norm -> SiLU -> 3x3 conv down to out_channels
    # NOTE(review): zero_module is imported from diffusion_model_unet; by its name it
    # zero-initialises the wrapped conv - confirm.
    self.out = nn.Sequential(
        nn.GroupNorm(num_groups=norm_num_groups, num_channels=num_channels[0], eps=norm_eps, affine=True),
        nn.SiLU(),
        zero_module(
            Convolution(
                spatial_dims=spatial_dims,
                in_channels=num_channels[0],
                out_channels=out_channels,
                strides=1,
                kernel_size=3,
                padding=1,
                conv_only=True,
            )
        ),
    )
+ t_emb = t_emb.to(dtype=x.dtype) + emb = self.time_embed(t_emb) + + if self.num_class_embeds is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + class_emb = self.class_embedding(class_labels) + class_emb = class_emb.to(dtype=x.dtype) + emb += class_emb + return emb + + def _get_input_embeddings(self, emb, top_index, bottom_index, spacing): + if self.include_top_region_index_input: + _emb = self.top_region_index_layer(top_index) + emb = torch.cat((emb, _emb), dim=1) + if self.include_bottom_region_index_input: + _emb = self.bottom_region_index_layer(bottom_index) + emb = torch.cat((emb, _emb), dim=1) + if self.include_spacing_input: + _emb = self.spacing_layer(spacing) + emb = torch.cat((emb, _emb), dim=1) + return emb + + def _apply_down_blocks(self, h, emb, context, down_block_additional_residuals): + if context is not None and self.with_conditioning is False: + raise ValueError("model should have with_conditioning = True if context is provided") + down_block_res_samples: list[torch.Tensor] = [h] + for downsample_block in self.down_blocks: + h, res_samples = downsample_block(hidden_states=h, temb=emb, context=context) + down_block_res_samples.extend(res_samples) + + # Additional residual conections for Controlnets + if down_block_additional_residuals is not None: + new_down_block_res_samples: list[torch.Tensor] = [] + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample += down_block_additional_residual + new_down_block_res_samples.append(down_block_res_sample) + + down_block_res_samples = new_down_block_res_samples + return h, down_block_res_samples + + def _apply_up_blocks(self, h, emb, context, down_block_res_samples): + for upsample_block in self.up_blocks: + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: 
-len(upsample_block.resnets)] + h = upsample_block(hidden_states=h, res_hidden_states_list=res_samples, temb=emb, context=context) + + return h + + def forward( + self, + x: torch.Tensor, + timesteps: torch.Tensor, + context: torch.Tensor | None = None, + class_labels: torch.Tensor | None = None, + down_block_additional_residuals: tuple[torch.Tensor] | None = None, + mid_block_additional_residual: torch.Tensor | None = None, + top_region_index_tensor: torch.Tensor | None = None, + bottom_region_index_tensor: torch.Tensor | None = None, + spacing_tensor: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Forward pass through the UNet model. + + Args: + x: Input tensor of shape (N, C, SpatialDims). + timesteps: Timestep tensor of shape (N,). + context: Context tensor of shape (N, 1, ContextDim). + class_labels: Class labels tensor of shape (N,). + down_block_additional_residuals: Additional residual tensors for down blocks of shape (N, C, FeatureMapsDims). + mid_block_additional_residual: Additional residual tensor for mid block of shape (N, C, FeatureMapsDims). + top_region_index_tensor: Tensor representing top region index of shape (N, 4). + bottom_region_index_tensor: Tensor representing bottom region index of shape (N, 4). + spacing_tensor: Tensor representing spacing of shape (N, 3). + + Returns: + A tensor representing the output of the UNet model. 
+ """ + + emb = self._get_time_and_class_embedding(x, timesteps, class_labels) + emb = self._get_input_embeddings(emb, top_region_index_tensor, bottom_region_index_tensor, spacing_tensor) + h = self.conv_in(x) + h, _updated_down_block_res_samples = self._apply_down_blocks(h, emb, context, down_block_additional_residuals) + h = self.middle_block(h, emb, context) + + # Additional residual conections for Controlnets + if mid_block_additional_residual is not None: + h += mid_block_additional_residual + + h = self._apply_up_blocks(h, emb, context, _updated_down_block_res_samples) + h = self.out(h) + h_tensor: torch.Tensor = convert_to_tensor(h) + return h_tensor diff --git a/monai/apps/generation/maisi/utils/__init__.py b/monai/apps/generation/maisi/utils/__init__.py new file mode 100644 index 0000000000..1e97f89407 --- /dev/null +++ b/monai/apps/generation/maisi/utils/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/monai/apps/nnunet/nnunetv2_runner.py b/monai/apps/nnunet/nnunetv2_runner.py index 44b3c24256..8a10849904 100644 --- a/monai/apps/nnunet/nnunetv2_runner.py +++ b/monai/apps/nnunet/nnunetv2_runner.py @@ -22,6 +22,7 @@ from monai.apps.nnunet.utils import analyze_data, create_new_data_copy, create_new_dataset_json from monai.bundle import ConfigParser from monai.utils import ensure_tuple, optional_import +from monai.utils.misc import run_cmd load_pickle, _ = optional_import("batchgenerators.utilities.file_and_folder_operations", name="load_pickle") join, _ = optional_import("batchgenerators.utilities.file_and_folder_operations", name="join") @@ -495,65 +496,64 @@ def train_single_model(self, config: Any, fold: int, gpu_id: tuple | list | int fold: fold of the 5-fold cross-validation. Should be an int between 0 and 4. gpu_id: an integer to select the device to use, or a tuple/list of GPU device indices used for multi-GPU training (e.g., (0,1)). Default: 0. - from nnunetv2.run.run_training import run_training kwargs: this optional parameter allows you to specify additional arguments in - ``nnunetv2.run.run_training.run_training``. Currently supported args are - - plans_identifier: custom plans identifier. Default: "nnUNetPlans". - - pretrained_weights: path to nnU-Net checkpoint file to be used as pretrained model. Will only be - used when actually training. Beta. Use with caution. Default: False. - - use_compressed_data: True to use compressed data for training. Reading compressed data is much - more CPU and (potentially) RAM intensive and should only be used if you know what you are - doing. Default: False. - - continue_training: continue training from latest checkpoint. Default: False. - - only_run_validation: True to run the validation only. Requires training to have finished. - Default: False. - - disable_checkpointing: True to disable checkpointing. Ideal for testing things out and you - don't want to flood your hard drive with checkpoints. 
Default: False. + ``nnunetv2.run.run_training.run_training_entry``. + + Currently supported args are: + + - p: custom plans identifier. Default: "nnUNetPlans". + - pretrained_weights: path to nnU-Net checkpoint file to be used as pretrained model. Will only be + used when actually training. Beta. Use with caution. Default: False. + - use_compressed: True to use compressed data for training. Reading compressed data is much + more CPU and (potentially) RAM intensive and should only be used if you know what you are + doing. Default: False. + - c: continue training from latest checkpoint. Default: False. + - val: True to run the validation only. Requires training to have finished. + Default: False. + - disable_checkpointing: True to disable checkpointing. Ideal for testing things out and you + don't want to flood your hard drive with checkpoints. Default: False. """ if "num_gpus" in kwargs: kwargs.pop("num_gpus") logger.warning("please use gpu_id to set the GPUs to use") - if "trainer_class_name" in kwargs: - kwargs.pop("trainer_class_name") + if "tr" in kwargs: + kwargs.pop("tr") logger.warning("please specify the `trainer_class_name` in the __init__ of `nnUNetV2Runner`.") - if "export_validation_probabilities" in kwargs: - kwargs.pop("export_validation_probabilities") + if "npz" in kwargs: + kwargs.pop("npz") logger.warning("please specify the `export_validation_probabilities` in the __init__ of `nnUNetV2Runner`.") + cmd = self.train_single_model_command(config, fold, gpu_id, kwargs) + run_cmd(cmd, shell=True) + + def train_single_model_command(self, config, fold, gpu_id, kwargs): if isinstance(gpu_id, (tuple, list)): if len(gpu_id) > 1: gpu_ids_str = "" for _i in range(len(gpu_id)): gpu_ids_str += f"{gpu_id[_i]}," - os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids_str[:-1] + device_setting = f"CUDA_VISIBLE_DEVICES={gpu_ids_str[:-1]}" else: - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id[0]}" - else: - os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}" - - from 
nnunetv2.run.run_training import run_training - - if isinstance(gpu_id, int) or len(gpu_id) == 1: - run_training( - dataset_name_or_id=self.dataset_name_or_id, - configuration=config, - fold=fold, - trainer_class_name=self.trainer_class_name, - export_validation_probabilities=self.export_validation_probabilities, - **kwargs, - ) + device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id[0]}" else: - run_training( - dataset_name_or_id=self.dataset_name_or_id, - configuration=config, - fold=fold, - num_gpus=len(gpu_id), - trainer_class_name=self.trainer_class_name, - export_validation_probabilities=self.export_validation_probabilities, - **kwargs, - ) + device_setting = f"CUDA_VISIBLE_DEVICES={gpu_id}" + num_gpus = 1 if isinstance(gpu_id, int) or len(gpu_id) == 1 else len(gpu_id) + + cmd = ( + f"{device_setting} nnUNetv2_train " + + f"{self.dataset_name_or_id} {config} {fold} " + + f"-tr {self.trainer_class_name} -num_gpus {num_gpus}" + ) + if self.export_validation_probabilities: + cmd += " --npz" + for _key, _value in kwargs.items(): + if _key == "p" or _key == "pretrained_weights": + cmd += f" -{_key} {_value}" + else: + cmd += f" --{_key} {_value}" + return cmd def train( self, @@ -637,15 +637,7 @@ def train_parallel_cmd( if _config in ensure_tuple(configs): for _i in range(self.num_folds): the_device = gpu_id_for_all[_index % n_devices] # type: ignore - cmd = ( - "python -m monai.apps.nnunet nnUNetV2Runner train_single_model " - + f"--input_config '{self.input_config_or_dict}' --work_dir '{self.work_dir}' " - + f"--config '{_config}' --fold {_i} --gpu_id {the_device} " - + f"--trainer_class_name {self.trainer_class_name} " - + f"--export_validation_probabilities {self.export_validation_probabilities}" - ) - for _key, _value in kwargs.items(): - cmd += f" --{_key} {_value}" + cmd = self.train_single_model_command(_config, _i, the_device, kwargs) all_cmds[-1][the_device].append(cmd) _index += 1 return all_cmds diff --git a/monai/apps/nuclick/transforms.py 
b/monai/apps/nuclick/transforms.py index f22ea764be..4828bd2e5a 100644 --- a/monai/apps/nuclick/transforms.py +++ b/monai/apps/nuclick/transforms.py @@ -24,7 +24,7 @@ measure, _ = optional_import("skimage.measure") morphology, _ = optional_import("skimage.morphology") -distance_transform_cdt, _ = optional_import("scipy.ndimage.morphology", name="distance_transform_cdt") +distance_transform_cdt, _ = optional_import("scipy.ndimage", name="distance_transform_cdt") class NuclickKeys(StrEnum): diff --git a/monai/apps/pathology/transforms/post/array.py b/monai/apps/pathology/transforms/post/array.py index 99e94f89c0..035bce2c69 100644 --- a/monai/apps/pathology/transforms/post/array.py +++ b/monai/apps/pathology/transforms/post/array.py @@ -28,12 +28,12 @@ SobelGradients, ) from monai.transforms.transform import Transform -from monai.transforms.utils_pytorch_numpy_unification import max, maximum, min, sum, unique +from monai.transforms.utils_pytorch_numpy_unification import max, maximum, min, sum, unique, where from monai.utils import TransformBackends, convert_to_numpy, optional_import from monai.utils.misc import ensure_tuple_rep from monai.utils.type_conversion import convert_to_dst_type, convert_to_tensor -label, _ = optional_import("scipy.ndimage.measurements", name="label") +label, _ = optional_import("scipy.ndimage", name="label") disk, _ = optional_import("skimage.morphology", name="disk") opening, _ = optional_import("skimage.morphology", name="opening") watershed, _ = optional_import("skimage.segmentation", name="watershed") @@ -162,7 +162,8 @@ def __call__(self, prob_map: NdarrayOrTensor) -> NdarrayOrTensor: pred = label(pred)[0] if self.remove_small_objects is not None: pred = self.remove_small_objects(pred) - pred[pred > 0] = 1 + pred_indices = np.where(pred > 0) + pred[pred_indices] = 1 return convert_to_dst_type(pred, prob_map, dtype=self.dtype)[0] @@ -338,7 +339,8 @@ def __call__(self, mask: NdarrayOrTensor, instance_border: NdarrayOrTensor) -> N 
instance_border = instance_border >= self.threshold # uncertain area marker = mask - convert_to_dst_type(instance_border, mask)[0] # certain foreground - marker[marker < 0] = 0 + marker_indices = where(marker < 0) + marker[marker_indices] = 0 # type: ignore[index] marker = self.postprocess_fn(marker) marker = convert_to_numpy(marker) @@ -379,6 +381,7 @@ def _generate_contour_coord(self, current: np.ndarray, previous: np.ndarray) -> """ p_delta = (current[0] - previous[0], current[1] - previous[1]) + row, col = -1, -1 if p_delta in ((0.0, 1.0), (0.5, 0.5), (1.0, 0.0)): row = int(current[0] + 0.5) @@ -634,7 +637,7 @@ def __call__( # type: ignore seg_map_crop = convert_to_dst_type(seg_map_crop == instance_id, type_map_crop, dtype=bool)[0] - inst_type = type_map_crop[seg_map_crop] + inst_type = type_map_crop[seg_map_crop] # type: ignore[index] type_list, type_pixels = unique(inst_type, return_counts=True) type_list = list(zip(type_list, type_pixels)) type_list = sorted(type_list, key=lambda x: x[1], reverse=True) diff --git a/monai/apps/pathology/utils.py b/monai/apps/pathology/utils.py index d3ebe0a7a6..3aa0bfab86 100644 --- a/monai/apps/pathology/utils.py +++ b/monai/apps/pathology/utils.py @@ -33,10 +33,10 @@ def compute_multi_instance_mask(mask: np.ndarray, threshold: float) -> Any: """ neg = 255 - mask * 255 - distance = ndimage.morphology.distance_transform_edt(neg) + distance = ndimage.distance_transform_edt(neg) binary = distance < threshold - filled_image = ndimage.morphology.binary_fill_holes(binary) + filled_image = ndimage.binary_fill_holes(binary) multi_instance_mask = measure.label(filled_image, connectivity=2) return multi_instance_mask diff --git a/monai/apps/utils.py b/monai/apps/utils.py index db541923b5..0c998146a3 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -135,7 +135,12 @@ def check_hash(filepath: PathLike, val: str | None = None, hash_type: str = "md5 logger.info(f"Expected {hash_type} is None, skip {hash_type} check for file 
{filepath}.") return True actual_hash_func = look_up_option(hash_type.lower(), SUPPORTED_HASH_TYPES) - actual_hash = actual_hash_func() + + if sys.version_info >= (3, 9): + actual_hash = actual_hash_func(usedforsecurity=False) # allows checks on FIPS enabled machines + else: + actual_hash = actual_hash_func() + try: with open(filepath, "rb") as f: for chunk in iter(lambda: f.read(1024 * 1024), b""): diff --git a/monai/auto3dseg/analyzer.py b/monai/auto3dseg/analyzer.py index 56419da4cb..e60327b551 100644 --- a/monai/auto3dseg/analyzer.py +++ b/monai/auto3dseg/analyzer.py @@ -460,7 +460,7 @@ def __call__(self, data: Mapping[Hashable, MetaTensor]) -> dict[Hashable, MetaTe torch.set_grad_enabled(False) ndas: list[MetaTensor] = [d[self.image_key][i] for i in range(d[self.image_key].shape[0])] # type: ignore - ndas_label: MetaTensor = d[self.label_key].astype(torch.int8) # (H,W,D) + ndas_label: MetaTensor = d[self.label_key].astype(torch.int16) # (H,W,D) if ndas_label.shape != ndas[0].shape: raise ValueError(f"Label shape {ndas_label.shape} is different from image shape {ndas[0].shape}") @@ -470,9 +470,9 @@ def __call__(self, data: Mapping[Hashable, MetaTensor]) -> dict[Hashable, MetaTe unique_label = unique(ndas_label) if isinstance(ndas_label, (MetaTensor, torch.Tensor)): - unique_label = unique_label.data.cpu().numpy() + unique_label = unique_label.data.cpu().numpy() # type: ignore[assignment] - unique_label = unique_label.astype(np.int8).tolist() + unique_label = unique_label.astype(np.int16).tolist() label_substats = [] # each element is one label pixel_sum = 0 diff --git a/monai/auto3dseg/utils.py b/monai/auto3dseg/utils.py index 58b900d410..211f23c415 100644 --- a/monai/auto3dseg/utils.py +++ b/monai/auto3dseg/utils.py @@ -407,7 +407,7 @@ def _prepare_cmd_default(cmd: str, cmd_prefix: str | None = None, **kwargs: Any) Args: cmd: the command or script to run in the distributed job. 
- cmd_prefix: the command prefix to run the script, e.g., "python", "python -m", "python3", "/opt/conda/bin/python3.8 ". + cmd_prefix: the command prefix to run the script, e.g., "python", "python -m", "python3", "/opt/conda/bin/python3.9 ". kwargs: the keyword arguments to be passed to the script. Returns: diff --git a/monai/bundle/config_item.py b/monai/bundle/config_item.py index 844d5b30bf..e5122bf3de 100644 --- a/monai/bundle/config_item.py +++ b/monai/bundle/config_item.py @@ -289,10 +289,7 @@ def instantiate(self, **kwargs: Any) -> object: mode = self.get_config().get("_mode_", CompInitMode.DEFAULT) args = self.resolve_args() args.update(kwargs) - try: - return instantiate(modname, mode, **args) - except Exception as e: - raise RuntimeError(f"Failed to instantiate {self}") from e + return instantiate(modname, mode, **args) class ConfigExpression(ConfigItem): diff --git a/monai/bundle/config_parser.py b/monai/bundle/config_parser.py index 829036af6f..a2ffeedc92 100644 --- a/monai/bundle/config_parser.py +++ b/monai/bundle/config_parser.py @@ -118,7 +118,7 @@ def __init__( self.ref_resolver = ReferenceResolver() if config is None: config = {self.meta_key: {}} - self.set(config=config) + self.set(config=self.ref_resolver.normalize_meta_id(config)) def __repr__(self): return f"{self.config}" @@ -221,7 +221,7 @@ def set(self, config: Any, id: str = "", recursive: bool = True) -> None: if isinstance(conf_, dict) and k not in conf_: conf_[k] = {} conf_ = conf_[k if isinstance(conf_, dict) else int(k)] - self[ReferenceResolver.normalize_id(id)] = config + self[ReferenceResolver.normalize_id(id)] = self.ref_resolver.normalize_meta_id(config) def update(self, pairs: dict[str, Any]) -> None: """ diff --git a/monai/bundle/reference_resolver.py b/monai/bundle/reference_resolver.py index b36f2cc4a5..050cd75fa7 100644 --- a/monai/bundle/reference_resolver.py +++ b/monai/bundle/reference_resolver.py @@ -17,7 +17,7 @@ from typing import Any, Iterator from 
monai.bundle.config_item import ConfigComponent, ConfigExpression, ConfigItem -from monai.bundle.utils import ID_REF_KEY, ID_SEP_KEY +from monai.bundle.utils import DEPRECATED_ID_MAPPING, ID_REF_KEY, ID_SEP_KEY from monai.utils import allow_missing_reference, look_up_option __all__ = ["ReferenceResolver"] @@ -202,6 +202,23 @@ def normalize_id(cls, id: str | int) -> str: """ return str(id).replace("#", cls.sep) # backward compatibility `#` is the old separator + def normalize_meta_id(self, config: Any) -> Any: + """ + Update deprecated identifiers in `config` using `DEPRECATED_ID_MAPPING`. + This will replace names that are marked as deprecated with their replacement. + + Args: + config: input config to be updated. + """ + if isinstance(config, dict): + for _id, _new_id in DEPRECATED_ID_MAPPING.items(): + if _id in config.keys(): + warnings.warn( + f"Detected deprecated name '{_id}' in configuration file, replacing with '{_new_id}'." + ) + config[_new_id] = config.pop(_id) + return config + @classmethod def split_id(cls, id: str | int, last: bool = False) -> list[str]: """ diff --git a/monai/bundle/scripts.py b/monai/bundle/scripts.py index 2565a3cf64..6dd83c1f81 100644 --- a/monai/bundle/scripts.py +++ b/monai/bundle/scripts.py @@ -16,6 +16,7 @@ import os import re import warnings +import zipfile from collections.abc import Mapping, Sequence from pathlib import Path from pydoc import locate @@ -26,7 +27,7 @@ import torch from torch.cuda import is_available -from monai.apps.mmars.mmars import _get_all_ngc_models +from monai._version import get_versions from monai.apps.utils import _basename, download_url, extractall, get_logger from monai.bundle.config_item import ConfigComponent from monai.bundle.config_parser import ConfigParser @@ -66,6 +67,9 @@ DEFAULT_DOWNLOAD_SOURCE = os.environ.get("BUNDLE_DOWNLOAD_SRC", "monaihosting") PPRINT_CONFIG_N = 5 +MONAI_HOSTING_BASE_URL = "https://api.ngc.nvidia.com/v2/models/nvidia/monaihosting" +NGC_BASE_URL = 
"https://api.ngc.nvidia.com/v2/models/nvidia/monaitoolkit" + def update_kwargs(args: str | dict | None = None, ignore_none: bool = True, **kwargs: Any) -> dict: """ @@ -168,12 +172,19 @@ def _get_git_release_url(repo_owner: str, repo_name: str, tag_name: str, filenam def _get_ngc_bundle_url(model_name: str, version: str) -> str: - return f"https://api.ngc.nvidia.com/v2/models/nvidia/monaitoolkit/{model_name.lower()}/versions/{version}/zip" + return f"{NGC_BASE_URL}/{model_name.lower()}/versions/{version}/zip" + + +def _get_ngc_private_base_url(repo: str) -> str: + return f"https://api.ngc.nvidia.com/v2/{repo}/models" + + +def _get_ngc_private_bundle_url(model_name: str, version: str, repo: str) -> str: + return f"{_get_ngc_private_base_url(repo)}/{model_name.lower()}/versions/{version}/zip" def _get_monaihosting_bundle_url(model_name: str, version: str) -> str: - monaihosting_root_path = "https://api.ngc.nvidia.com/v2/models/nvidia/monaihosting" - return f"{monaihosting_root_path}/{model_name.lower()}/versions/{version}/files/{model_name}_v{version}.zip" + return f"{MONAI_HOSTING_BASE_URL}/{model_name.lower()}/versions/{version}/files/{model_name}_v{version}.zip" def _download_from_github(repo: str, download_path: Path, filename: str, progress: bool = True) -> None: @@ -206,10 +217,15 @@ def _remove_ngc_prefix(name: str, prefix: str = "monai_") -> str: def _download_from_ngc( - download_path: Path, filename: str, version: str, remove_prefix: str | None, progress: bool + download_path: Path, + filename: str, + version: str, + prefix: str = "monai_", + remove_prefix: str | None = "monai_", + progress: bool = True, ) -> None: # ensure prefix is contained - filename = _add_ngc_prefix(filename) + filename = _add_ngc_prefix(filename, prefix=prefix) url = _get_ngc_bundle_url(model_name=filename, version=version) filepath = download_path / f"{filename}_v{version}.zip" if remove_prefix: @@ -219,29 +235,174 @@ def _download_from_ngc( extractall(filepath=filepath, 
output_dir=extract_path, has_base=True) +def _download_from_ngc_private( + download_path: Path, + filename: str, + version: str, + repo: str, + prefix: str = "monai_", + remove_prefix: str | None = "monai_", + headers: dict | None = None, +) -> None: + # ensure prefix is contained + filename = _add_ngc_prefix(filename, prefix=prefix) + request_url = _get_ngc_private_bundle_url(model_name=filename, version=version, repo=repo) + if has_requests: + headers = {} if headers is None else headers + response = requests_get(request_url, headers=headers) + response.raise_for_status() + else: + raise ValueError("NGC API requires requests package. Please install it.") + + zip_path = download_path / f"{filename}_v{version}.zip" + with open(zip_path, "wb") as f: + f.write(response.content) + logger.info(f"Downloading: {zip_path}.") + if remove_prefix: + filename = _remove_ngc_prefix(filename, prefix=remove_prefix) + extract_path = download_path / f"{filename}" + with zipfile.ZipFile(zip_path, "r") as z: + z.extractall(extract_path) + logger.info(f"Writing into directory: {extract_path}.") + + +def _get_ngc_token(api_key, retry=0): + """Try to connect to NGC.""" + url = "https://authn.nvidia.com/token?service=ngc" + headers = {"Accept": "application/json", "Authorization": "ApiKey " + api_key} + if has_requests: + response = requests_get(url, headers=headers) + if not response.ok: + # retry 3 times, if failed, raise an error. + if retry < 3: + logger.info(f"Retrying {retry} time(s) to GET {url}.") + return _get_ngc_token(url, retry + 1) + raise RuntimeError("NGC API response is not ok. 
Failed to get token.") + else: + token = response.json()["token"] + return token + + def _get_latest_bundle_version_monaihosting(name): - url = "https://api.ngc.nvidia.com/v2/models/nvidia/monaihosting" - full_url = f"{url}/{name.lower()}" + full_url = f"{MONAI_HOSTING_BASE_URL}/{name.lower()}" requests_get, has_requests = optional_import("requests", name="get") if has_requests: resp = requests_get(full_url) resp.raise_for_status() else: - raise ValueError("NGC API requires requests package. Please install it.") + raise ValueError("NGC API requires requests package. Please install it.") model_info = json.loads(resp.text) return model_info["model"]["latestVersionIdStr"] -def _get_latest_bundle_version(source: str, name: str, repo: str) -> dict[str, list[str] | str] | Any | None: +def _examine_monai_version(monai_version: str) -> tuple[bool, str]: + """Examine if the package version is compatible with the MONAI version in the metadata.""" + version_dict = get_versions() + package_version = version_dict.get("version", "0+unknown") + if package_version == "0+unknown": + return False, "Package version is not available. Skipping version check." + if monai_version == "0+unknown": + return False, "MONAI version is not specified in the bundle. Skipping version check." 
+ # treat rc versions as the same as the release version + package_version = re.sub(r"rc\d.*", "", package_version) + monai_version = re.sub(r"rc\d.*", "", monai_version) + if package_version < monai_version: + return ( + False, + f"Your MONAI version is {package_version}, but the bundle is built on MONAI version {monai_version}.", + ) + return True, "" + + +def _check_monai_version(bundle_dir: PathLike, name: str) -> None: + """Get the `monai_version` from the metadata.json and compare if it is smaller than the installed `monai` package version""" + metadata_file = Path(bundle_dir) / name / "configs" / "metadata.json" + if not metadata_file.exists(): + logger.warning(f"metadata file not found in {metadata_file}.") + return + with open(metadata_file) as f: + metadata = json.load(f) + is_compatible, msg = _examine_monai_version(metadata.get("monai_version", "0+unknown")) + if not is_compatible: + logger.warning(msg) + + +def _list_latest_versions(data: dict, max_versions: int = 3) -> list[str]: + """ + Extract the latest versions from the data dictionary. + + Args: + data: the data dictionary. + max_versions: the maximum number of versions to return. + + Returns: + versions of the latest models in the reverse order of creation date, e.g. ['1.0.0', '0.9.0', '0.8.0']. + """ + # Check if the data is a dictionary and it has the key 'modelVersions' + if not isinstance(data, dict) or "modelVersions" not in data: + raise ValueError("The data is not a dictionary or it does not have the key 'modelVersions'.") + + # Extract the list of model versions + model_versions = data["modelVersions"] + + if ( + not isinstance(model_versions, list) + or len(model_versions) == 0 + or "createdDate" not in model_versions[0] + or "versionId" not in model_versions[0] + ): + raise ValueError( + "The model versions are not a list or it is empty or it does not have the keys 'createdDate' and 'versionId'." 
+ ) + + # Sort the versions by the 'createdDate' in descending order + sorted_versions = sorted(model_versions, key=lambda x: x["createdDate"], reverse=True) + return [v["versionId"] for v in sorted_versions[:max_versions]] + + +def _get_latest_bundle_version_ngc(name: str, repo: str | None = None, headers: dict | None = None) -> str: + base_url = _get_ngc_private_base_url(repo) if repo else NGC_BASE_URL + version_endpoint = base_url + f"/{name.lower()}/versions/" + + if not has_requests: + raise ValueError("requests package is required, please install it.") + + version_header = {"Accept-Encoding": "gzip, deflate"} # Excluding 'zstd' to fit NGC requirements + if headers: + version_header.update(headers) + resp = requests_get(version_endpoint, headers=version_header) + resp.raise_for_status() + model_info = json.loads(resp.text) + latest_versions = _list_latest_versions(model_info) + + for version in latest_versions: + file_endpoint = base_url + f"/{name.lower()}/versions/{version}/files/configs/metadata.json" + resp = requests_get(file_endpoint, headers=headers) + metadata = json.loads(resp.text) + resp.raise_for_status() + # if the package version is not available or the model is compatible with the package version + is_compatible, _ = _examine_monai_version(metadata["monai_version"]) + if is_compatible: + if version != latest_versions[0]: + logger.info(f"Latest version is {latest_versions[0]}, but the compatible version is {version}.") + return version + + # if no compatible version is found, return the latest version + return latest_versions[0] + + +def _get_latest_bundle_version( + source: str, name: str, repo: str, **kwargs: Any +) -> dict[str, list[str] | str] | Any | None: if source == "ngc": name = _add_ngc_prefix(name) - model_dict = _get_all_ngc_models(name) - for v in model_dict.values(): - if v["name"] == name: - return v["latest"] - return None + return _get_latest_bundle_version_ngc(name) elif source == "monaihosting": return 
_get_latest_bundle_version_monaihosting(name) + elif source == "ngc_private": + headers = kwargs.pop("headers", {}) + name = _add_ngc_prefix(name) + return _get_latest_bundle_version_ngc(name, repo=repo, headers=headers) elif source == "github": repo_owner, repo_name, tag_name = repo.split("/") return get_bundle_versions(name, repo=f"{repo_owner}/{repo_name}", tag=tag_name)["latest_version"] @@ -308,6 +469,9 @@ def download( # Execute this module as a CLI entry, and download bundle via URL: python -m monai.bundle download --name --url + # Execute this module as a CLI entry, and download bundle from ngc_private with latest version: + python -m monai.bundle download --name --source "ngc_private" --bundle_dir "./" --repo "org/org_name" + # Set default args of `run` in a JSON / YAML file, help to record and simplify the command line. # Other args still can override the default args at runtime. # The content of the JSON / YAML file is a dictionary. For example: @@ -328,14 +492,17 @@ def download( Default is `bundle` subfolder under `torch.hub.get_dir()`. source: storage location name. This argument is used when `url` is `None`. In default, the value is achieved from the environment variable BUNDLE_DOWNLOAD_SRC, and - it should be "ngc", "monaihosting", "github", or "huggingface_hub". + it should be "ngc", "monaihosting", "github", "ngc_private", or "huggingface_hub". + If source is "ngc_private", you need specify the NGC_API_KEY in the environment variable. repo: repo name. This argument is used when `url` is `None` and `source` is "github" or "huggingface_hub". If `source` is "github", it should be in the form of "repo_owner/repo_name/release_tag". If `source` is "huggingface_hub", it should be in the form of "repo_owner/repo_name". + If `source` is "ngc_private", it should be in the form of "org/org_name" or "org/org_name/team/team_name", + or you can specify the environment variable NGC_ORG and NGC_TEAM. url: url to download the data. 
If not `None`, data will be downloaded directly and `source` will not be checked. If `name` is `None`, filename is determined by `monai.apps.utils._basename(url)`. - remove_prefix: This argument is used when `source` is "ngc". Currently, all ngc bundles + remove_prefix: This argument is used when `source` is "ngc" or "ngc_private". Currently, all ngc bundles have the ``monai_`` prefix, which is not existing in their model zoo contrasts. In order to maintain the consistency between these two sources, remove prefix is necessary. Therefore, if specified, downloaded folder name will remove the prefix. @@ -363,11 +530,18 @@ def download( bundle_dir_ = _process_bundle_dir(bundle_dir_) if repo_ is None: - repo_ = "Project-MONAI/model-zoo/hosting_storage_v1" - if len(repo_.split("/")) != 3 and source_ != "huggingface_hub": - raise ValueError("repo should be in the form of `repo_owner/repo_name/release_tag`.") + org_ = os.getenv("NGC_ORG", None) + team_ = os.getenv("NGC_TEAM", None) + if org_ is not None and source_ == "ngc_private": + repo_ = f"org/{org_}/team/{team_}" if team_ is not None else f"org/{org_}" + else: + repo_ = "Project-MONAI/model-zoo/hosting_storage_v1" + if len(repo_.split("/")) not in (2, 4) and source_ == "ngc_private": + raise ValueError(f"repo should be in the form of `org/org_name/team/team_name` or `org/org_name`, got {repo_}.") + if len(repo_.split("/")) != 3 and source_ == "github": + raise ValueError(f"repo should be in the form of `repo_owner/repo_name/release_tag`, got {repo_}.") elif len(repo_.split("/")) != 2 and source_ == "huggingface_hub": - raise ValueError("Hugging Face Hub repo should be in the form of `repo_owner/repo_name`") + raise ValueError(f"Hugging Face Hub repo should be in the form of `repo_owner/repo_name`, got {repo_}.") if url_ is not None: if name_ is not None: filepath = bundle_dir_ / f"{name_}.zip" @@ -376,14 +550,22 @@ def download( download_url(url=url_, filepath=filepath, hash_val=None, progress=progress_) 
extractall(filepath=filepath, output_dir=bundle_dir_, has_base=True) else: + headers = {} if name_ is None: raise ValueError(f"To download from source: {source_}, `name` must be provided.") + if source == "ngc_private": + api_key = os.getenv("NGC_API_KEY", None) + if api_key is None: + raise ValueError("API key is required for ngc_private source.") + else: + token = _get_ngc_token(api_key) + headers = {"Authorization": f"Bearer {token}"} + if version_ is None: - version_ = _get_latest_bundle_version(source=source_, name=name_, repo=repo_) + version_ = _get_latest_bundle_version(source=source_, name=name_, repo=repo_, headers=headers) if source_ == "github": - if version_ is not None: - name_ = "_v".join([name_, version_]) - _download_from_github(repo=repo_, download_path=bundle_dir_, filename=name_, progress=progress_) + name_ver = "_v".join([name_, version_]) if version_ is not None else name_ + _download_from_github(repo=repo_, download_path=bundle_dir_, filename=name_ver, progress=progress_) elif source_ == "monaihosting": _download_from_monaihosting(download_path=bundle_dir_, filename=name_, version=version_, progress=progress_) elif source_ == "ngc": @@ -394,6 +576,15 @@ def download( remove_prefix=remove_prefix_, progress=progress_, ) + elif source_ == "ngc_private": + _download_from_ngc_private( + download_path=bundle_dir_, + filename=name_, + version=version_, + remove_prefix=remove_prefix_, + repo=repo_, + headers=headers, + ) elif source_ == "huggingface_hub": extract_path = os.path.join(bundle_dir_, name_) huggingface_hub.snapshot_download(repo_id=repo_, revision=version_, local_dir=extract_path) @@ -403,6 +594,8 @@ def download( f"got source: {source_}." 
) + _check_monai_version(bundle_dir_, name_) + @deprecated_arg("net_name", since="1.2", removed="1.5", msg_suffix="please use ``model`` instead.") @deprecated_arg("net_kwargs", since="1.2", removed="1.5", msg_suffix="please use ``model`` instead.") @@ -778,10 +971,19 @@ def run( https://docs.python.org/3/library/logging.config.html#logging.config.fileConfig. Default to None. tracking: if not None, enable the experiment tracking at runtime with optionally configurable and extensible. - if "mlflow", will add `MLFlowHandler` to the parsed bundle with default tracking settings, - if other string, treat it as file path to load the tracking settings. - if `dict`, treat it as tracking settings. - will patch the target config content with `tracking handlers` and the top-level items of `configs`. + If "mlflow", will add `MLFlowHandler` to the parsed bundle with default tracking settings where a set of + common parameters shown below will be added and can be passed through the `override` parameter of this method. + + - ``"output_dir"``: the path to save mlflow tracking outputs locally, default to "/eval". + - ``"tracking_uri"``: uri to save mlflow tracking outputs, default to "/output_dir/mlruns". + - ``"experiment_name"``: experiment name for this run, default to "monai_experiment". + - ``"run_name"``: the name of current run. + - ``"save_execute_config"``: whether to save the executed config files. It can be `False`, `/path/to/artifacts` + or `True`. If set to `True`, will save to the default path "/eval". Default to `True`. + + If other string, treat it as file path to load the tracking settings. + If `dict`, treat it as tracking settings. + Will patch the target config content with `tracking handlers` and the top-level items of `configs`. for detailed usage examples, please check the tutorial: https://github.com/Project-MONAI/tutorials/blob/main/experiment_management/bundle_integrate_mlflow.ipynb. 
args_file: a JSON or YAML file to provide default values for `run_id`, `meta_file`, diff --git a/monai/bundle/utils.py b/monai/bundle/utils.py index b187159c89..50d2608f4c 100644 --- a/monai/bundle/utils.py +++ b/monai/bundle/utils.py @@ -36,7 +36,7 @@ "monai_version": _conf_values["MONAI"], "pytorch_version": str(_conf_values["Pytorch"]).split("+")[0].split("a")[0], # 1.9.0a0+df837d0 or 1.13.0+cu117 "numpy_version": _conf_values["Numpy"], - "optional_packages_version": {}, + "required_packages_version": {}, "task": "Describe what the network predicts", "description": "A longer description of what the network does, use context, inputs, outputs, etc.", "authors": "Your Name Here", @@ -113,7 +113,7 @@ "experiment_name": "monai_experiment", "run_name": None, # may fill it at runtime - "execute_config": None, + "save_execute_config": True, "is_not_rank0": ( "$torch.distributed.is_available() \ and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0" @@ -125,7 +125,7 @@ "tracking_uri": "@tracking_uri", "experiment_name": "@experiment_name", "run_name": "@run_name", - "artifacts": "@execute_config", + "artifacts": "@save_execute_config", "iteration_log": True, "epoch_log": True, "tag_name": "train_loss", @@ -148,7 +148,7 @@ "tracking_uri": "@tracking_uri", "experiment_name": "@experiment_name", "run_name": "@run_name", - "artifacts": "@execute_config", + "artifacts": "@save_execute_config", "iteration_log": False, "close_on_complete": True, }, @@ -157,6 +157,8 @@ DEFAULT_EXP_MGMT_SETTINGS = {"mlflow": DEFAULT_MLFLOW_SETTINGS} # default experiment management settings +DEPRECATED_ID_MAPPING = {"optional_packages_version": "required_packages_version"} + def load_bundle_config(bundle_path: str, *config_names: str, **load_kw_args: Any) -> Any: """ @@ -221,6 +223,7 @@ def load_bundle_config(bundle_path: str, *config_names: str, **load_kw_args: Any raise ValueError(f"Cannot find config file '{full_cname}'") ardata = archive.read(full_cname) + cdata = {} if 
full_cname.lower().endswith("json"): cdata = json.loads(ardata, **load_kw_args) diff --git a/monai/bundle/workflows.py b/monai/bundle/workflows.py index da3aa30141..11c9bf0562 100644 --- a/monai/bundle/workflows.py +++ b/monai/bundle/workflows.py @@ -11,6 +11,7 @@ from __future__ import annotations +import json import os import sys import time @@ -24,6 +25,7 @@ from monai.bundle.config_parser import ConfigParser from monai.bundle.properties import InferProperties, MetaProperties, TrainProperties from monai.bundle.utils import DEFAULT_EXP_MGMT_SETTINGS, EXPR_KEY, ID_REF_KEY, ID_SEP_KEY +from monai.config import PathLike from monai.utils import BundleProperty, BundlePropertyConfig, deprecated_arg, deprecated_arg_default, ensure_tuple __all__ = ["BundleWorkflow", "ConfigWorkflow"] @@ -46,6 +48,10 @@ class BundleWorkflow(ABC): or "infer", "inference", "eval", "evaluation" for a inference workflow, other unsupported string will raise a ValueError. default to `None` for common workflow. + properties_path: the path to the JSON file of properties. + meta_file: filepath of the metadata file, if this is a list of file paths, their contents will be merged in order. + logging_file: config file for `logging` module in the program. for more details: + https://docs.python.org/3/library/logging.config.html#logging.config.fileConfig. 
""" @@ -59,21 +65,62 @@ class BundleWorkflow(ABC): new_name="workflow_type", msg_suffix="please use `workflow_type` instead.", ) - def __init__(self, workflow_type: str | None = None, workflow: str | None = None): + def __init__( + self, + workflow_type: str | None = None, + workflow: str | None = None, + properties_path: PathLike | None = None, + meta_file: str | Sequence[str] | None = None, + logging_file: str | None = None, + ): + if logging_file is not None: + if not os.path.isfile(logging_file): + raise FileNotFoundError(f"Cannot find the logging config file: {logging_file}.") + logger.info(f"Setting logging properties based on config: {logging_file}.") + fileConfig(logging_file, disable_existing_loggers=False) + + if meta_file is not None: + if isinstance(meta_file, str) and not os.path.isfile(meta_file): + logger.error( + f"Cannot find the metadata config file: {meta_file}. " + "Please see: https://docs.monai.io/en/stable/mb_specification.html" + ) + meta_file = None + if isinstance(meta_file, list): + for f in meta_file: + if not os.path.isfile(f): + logger.error( + f"Cannot find the metadata config file: {f}. 
" + "Please see: https://docs.monai.io/en/stable/mb_specification.html" + ) + meta_file = None + workflow_type = workflow if workflow is not None else workflow_type - if workflow_type is None: + if workflow_type is None and properties_path is None: self.properties = copy(MetaProperties) self.workflow_type = None + self.meta_file = meta_file + return + if properties_path is not None: + properties_path = Path(properties_path) + if not properties_path.is_file(): + raise ValueError(f"Property file {properties_path} does not exist.") + with open(properties_path) as json_file: + self.properties = json.load(json_file) + self.workflow_type = None + self.meta_file = meta_file return - if workflow_type.lower() in self.supported_train_type: + if workflow_type.lower() in self.supported_train_type: # type: ignore[union-attr] self.properties = {**TrainProperties, **MetaProperties} self.workflow_type = "train" - elif workflow_type.lower() in self.supported_infer_type: + elif workflow_type.lower() in self.supported_infer_type: # type: ignore[union-attr] self.properties = {**InferProperties, **MetaProperties} self.workflow_type = "infer" else: raise ValueError(f"Unsupported workflow type: '{workflow_type}'.") + self.meta_file = meta_file + @abstractmethod def initialize(self, *args: Any, **kwargs: Any) -> Any: """ @@ -142,6 +189,13 @@ def get_workflow_type(self): """ return self.workflow_type + def get_meta_file(self): + """ + Get the meta file. + + """ + return self.meta_file + def add_property(self, name: str, required: str, desc: str | None = None) -> None: """ Besides the default predefined properties, some 3rd party applications may need the bundle @@ -185,6 +239,7 @@ class ConfigWorkflow(BundleWorkflow): logging_file: config file for `logging` module in the program. for more details: https://docs.python.org/3/library/logging.config.html#logging.config.fileConfig. If None, default to "configs/logging.conf", which is commonly used for bundles in MONAI model zoo. 
+ If False, the logging logic for the bundle will not be modified. init_id: ID name of the expected config expression to initialize before running, default to "initialize". allow a config to have no `initialize` logic and the ID. run_id: ID name of the expected config expression to run, default to "run". @@ -206,6 +261,7 @@ class ConfigWorkflow(BundleWorkflow): or "infer", "inference", "eval", "evaluation" for a inference workflow, other unsupported string will raise a ValueError. default to `None` for common workflow. + properties_path: the path to the JSON file of properties. override: id-value pairs to override or add the corresponding config content. e.g. ``--net#input_chns 42``, ``--net %/data/other.json#net_arg`` @@ -223,53 +279,51 @@ def __init__( self, config_file: str | Sequence[str], meta_file: str | Sequence[str] | None = None, - logging_file: str | None = None, + logging_file: str | bool | None = None, init_id: str = "initialize", run_id: str = "run", final_id: str = "finalize", tracking: str | dict | None = None, workflow_type: str | None = None, workflow: str | None = None, + properties_path: PathLike | None = None, **override: Any, ) -> None: workflow_type = workflow if workflow is not None else workflow_type - super().__init__(workflow_type=workflow_type) if config_file is not None: _config_files = ensure_tuple(config_file) - self.config_root_path = Path(_config_files[0]).parent + config_root_path = Path(_config_files[0]).parent for _config_file in _config_files: _config_file = Path(_config_file) - if _config_file.parent != self.config_root_path: + if _config_file.parent != config_root_path: logger.warn( - f"Not all config files are in {self.config_root_path}. If logging_file and meta_file are" - f"not specified, {self.config_root_path} will be used as the default config root directory." + f"Not all config files are in {config_root_path}. 
If logging_file and meta_file are" + f"not specified, {config_root_path} will be used as the default config root directory." ) if not _config_file.is_file(): raise FileNotFoundError(f"Cannot find the config file: {_config_file}.") else: - self.config_root_path = Path("configs") - + config_root_path = Path("configs") + meta_file = str(config_root_path / "metadata.json") if meta_file is None else meta_file + super().__init__(workflow_type=workflow_type, meta_file=meta_file, properties_path=properties_path) + self.config_root_path = config_root_path logging_file = str(self.config_root_path / "logging.conf") if logging_file is None else logging_file - if logging_file is not None: - if not os.path.exists(logging_file): + if logging_file is False: + logger.warn(f"Logging file is set to {logging_file}, skipping logging.") + else: + if not os.path.isfile(logging_file): if logging_file == str(self.config_root_path / "logging.conf"): logger.warn(f"Default logging file in {logging_file} does not exist, skipping logging.") else: raise FileNotFoundError(f"Cannot find the logging config file: {logging_file}.") else: logger.info(f"Setting logging properties based on config: {logging_file}.") - fileConfig(logging_file, disable_existing_loggers=False) + fileConfig(str(logging_file), disable_existing_loggers=False) self.parser = ConfigParser() self.parser.read_config(f=config_file) - meta_file = str(self.config_root_path / "metadata.json") if meta_file is None else meta_file - if isinstance(meta_file, str) and not os.path.exists(meta_file): - logger.error( - f"Cannot find the metadata config file: {meta_file}. 
" - "Please see: https://docs.monai.io/en/stable/mb_specification.html" - ) - else: - self.parser.read_meta(f=meta_file) + if self.meta_file is not None: + self.parser.read_meta(f=self.meta_file) # the rest key-values in the _args are to override config content self.parser.update(pairs=override) @@ -455,13 +509,19 @@ def patch_bundle_tracking(parser: ConfigParser, settings: dict) -> None: parser[k] = v # save the executed config into file default_name = f"config_{time.strftime('%Y%m%d_%H%M%S')}.json" - filepath = parser.get("execute_config", None) - if filepath is None: - if "output_dir" not in parser: - # if no "output_dir" in the bundle config, default to "/eval" - parser["output_dir"] = f"{EXPR_KEY}{ID_REF_KEY}bundle_root + '/eval'" - # experiment management tools can refer to this config item to track the config info - parser["execute_config"] = parser["output_dir"] + f" + '/{default_name}'" - filepath = os.path.join(parser.get_parsed_content("output_dir"), default_name) - Path(filepath).parent.mkdir(parents=True, exist_ok=True) - parser.export_config_file(parser.get(), filepath) + # Users can set the `save_execute_config` to `False`, `/path/to/artifacts` or `True`. + # If set to False, nothing will be recorded. If set to True, the default path will be logged. + # If set to a file path, the given path will be logged. 
+ filepath = parser.get("save_execute_config", True) + if filepath: + if isinstance(filepath, bool): + if "output_dir" not in parser: + # if no "output_dir" in the bundle config, default to "/eval" + parser["output_dir"] = f"{EXPR_KEY}{ID_REF_KEY}bundle_root + '/eval'" + # experiment management tools can refer to this config item to track the config info + parser["save_execute_config"] = parser["output_dir"] + f" + '/{default_name}'" + filepath = os.path.join(parser.get_parsed_content("output_dir"), default_name) + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + parser.export_config_file(parser.get(), filepath) + else: + parser["save_execute_config"] = None diff --git a/monai/data/dataset.py b/monai/data/dataset.py index 531893d768..871b523289 100644 --- a/monai/data/dataset.py +++ b/monai/data/dataset.py @@ -36,15 +36,7 @@ from monai.data.meta_tensor import MetaTensor from monai.data.utils import SUPPORTED_PICKLE_MOD, convert_tables_to_dicts, pickle_hashing -from monai.transforms import ( - Compose, - Randomizable, - RandomizableTrait, - Transform, - apply_transform, - convert_to_contiguous, - reset_ops_id, -) +from monai.transforms import Compose, Randomizable, RandomizableTrait, Transform, convert_to_contiguous, reset_ops_id from monai.utils import MAX_SEED, convert_to_tensor, get_seed, look_up_option, min_version, optional_import from monai.utils.misc import first @@ -77,15 +69,19 @@ class Dataset(_TorchDataset): }, }, }] """ - def __init__(self, data: Sequence, transform: Callable | None = None) -> None: + def __init__(self, data: Sequence, transform: Sequence[Callable] | Callable | None = None) -> None: """ Args: data: input data to load and transform to generate dataset for model. - transform: a callable data transform on input data. - + transform: a callable, sequence of callables or None. If transform is not + a `Compose` instance, it will be wrapped in a `Compose` instance. 
Sequences + of callables are applied in order and if `None` is passed, the data is returned as is. """ self.data = data - self.transform: Any = transform + try: + self.transform = Compose(transform) if not isinstance(transform, Compose) else transform + except Exception as e: + raise ValueError("`transform` must be a callable or a list of callables that is Composable") from e def __len__(self) -> int: return len(self.data) @@ -95,7 +91,7 @@ def _transform(self, index: int): Fetch single data item from `self.data`. """ data_i = self.data[index] - return apply_transform(self.transform, data_i) if self.transform is not None else data_i + return self.transform(data_i) def __getitem__(self, index: int | slice | Sequence[int]): """ @@ -264,8 +260,6 @@ def __init__( using the cached content and with re-created transform instances. """ - if not isinstance(transform, Compose): - transform = Compose(transform) super().__init__(data=data, transform=transform) self.cache_dir = Path(cache_dir) if cache_dir is not None else None self.hash_func = hash_func @@ -323,9 +317,6 @@ def _pre_transform(self, item_transformed): random transform object """ - if not isinstance(self.transform, Compose): - raise ValueError("transform must be an instance of monai.transforms.Compose.") - first_random = self.transform.get_index_of_first( lambda t: isinstance(t, RandomizableTrait) or not isinstance(t, Transform) ) @@ -346,9 +337,6 @@ def _post_transform(self, item_transformed): the transformed element through the random transforms """ - if not isinstance(self.transform, Compose): - raise ValueError("transform must be an instance of monai.transforms.Compose.") - first_random = self.transform.get_index_of_first( lambda t: isinstance(t, RandomizableTrait) or not isinstance(t, Transform) ) @@ -427,7 +415,7 @@ def _transform(self, index: int): class CacheNTransDataset(PersistentDataset): """ - Extension of `PersistentDataset`, tt can also cache the result of first N transforms, no matter it's random 
or not. + Extension of `PersistentDataset`, it can also cache the result of first N transforms, no matter it's random or not. """ @@ -501,9 +489,6 @@ def _pre_transform(self, item_transformed): Returns: the transformed element up to the N transform object """ - if not isinstance(self.transform, Compose): - raise ValueError("transform must be an instance of monai.transforms.Compose.") - item_transformed = self.transform(item_transformed, end=self.cache_n_trans, threading=True) reset_ops_id(item_transformed) @@ -519,9 +504,6 @@ def _post_transform(self, item_transformed): Returns: the final transformed result """ - if not isinstance(self.transform, Compose): - raise ValueError("transform must be an instance of monai.transforms.Compose.") - return self.transform(item_transformed, start=self.cache_n_trans) @@ -809,8 +791,6 @@ def __init__( Not following these recommendations may lead to runtime errors or duplicated cache across processes. """ - if not isinstance(transform, Compose): - transform = Compose(transform) super().__init__(data=data, transform=transform) self.set_num = cache_num # tracking the user-provided `cache_num` option self.set_rate = cache_rate # tracking the user-provided `cache_rate` option @@ -1282,8 +1262,10 @@ def to_list(x): data = [] for dataset in self.data: data.extend(to_list(dataset[index])) + if self.transform is not None: - data = apply_transform(self.transform, data, map_items=False) # transform the list data + self.transform.map_items = False # Compose object map_items to false so transform is applied to list + data = self.transform(data) # use tuple instead of list as the default collate_fn callback of MONAI DataLoader flattens nested lists return tuple(data) @@ -1432,15 +1414,11 @@ def __len__(self): def _transform(self, index: int): data = {k: v[index] for k, v in self.arrays.items()} - - if not self.transform: - return data - - result = apply_transform(self.transform, data) + result = self.transform(data) if self.transform is not 
None else data if isinstance(result, dict) or (isinstance(result, list) and isinstance(result[0], dict)): return result - raise AssertionError("With a dict supplied to apply_transform, should return a dict or a list of dicts.") + raise AssertionError("With a dict supplied to Compose, should return a dict or a list of dicts.") class CSVDataset(Dataset): diff --git a/monai/data/dataset_summary.py b/monai/data/dataset_summary.py index 769ae33b46..5b9e32afca 100644 --- a/monai/data/dataset_summary.py +++ b/monai/data/dataset_summary.py @@ -84,6 +84,7 @@ def collect_meta_data(self): """ for data in self.data_loader: + meta_dict = {} if isinstance(data[self.image_key], MetaTensor): meta_dict = data[self.image_key].meta elif self.meta_key in data: diff --git a/monai/data/image_reader.py b/monai/data/image_reader.py index 2361bb63a7..f5e199e2a3 100644 --- a/monai/data/image_reader.py +++ b/monai/data/image_reader.py @@ -1331,7 +1331,7 @@ def get_data(self, img: NrrdImage | list[NrrdImage]) -> tuple[np.ndarray, dict]: header[MetaKeys.SPACE] = SpaceKeys.LPS # assuming LPS if not specified header[MetaKeys.AFFINE] = header[MetaKeys.ORIGINAL_AFFINE].copy() - header[MetaKeys.SPATIAL_SHAPE] = header["sizes"] + header[MetaKeys.SPATIAL_SHAPE] = header["sizes"].copy() [header.pop(k) for k in ("sizes", "space origin", "space directions")] # rm duplicated data in header if self.channel_dim is None: # default to "no_channel" or -1 diff --git a/monai/data/meta_tensor.py b/monai/data/meta_tensor.py index cad0851a8e..2df4da4a35 100644 --- a/monai/data/meta_tensor.py +++ b/monai/data/meta_tensor.py @@ -505,7 +505,7 @@ def peek_pending_rank(self): a = self.pending_operations[-1].get(LazyAttr.AFFINE, None) if self.pending_operations else self.affine return 1 if a is None else int(max(1, len(a) - 1)) - def new_empty(self, size, dtype=None, device=None, requires_grad=False): + def new_empty(self, size, dtype=None, device=None, requires_grad=False): # type: ignore[override] """ must be defined 
for deepcopy to work @@ -580,7 +580,7 @@ def ensure_torch_and_prune_meta( img.affine = MetaTensor.get_default_affine() return img - def __repr__(self): + def __repr__(self): # type: ignore[override] """ Prints a representation of the tensor. Prepends "meta" to ``torch.Tensor.__repr__``. diff --git a/monai/data/test_time_augmentation.py b/monai/data/test_time_augmentation.py index 23572dcef4..bcd5ea91a9 100644 --- a/monai/data/test_time_augmentation.py +++ b/monai/data/test_time_augmentation.py @@ -106,6 +106,8 @@ class TestTimeAugmentation: mode, mean, std, vvc = tt_aug(test_data) """ + __test__ = False # indicate to pytest that this class is not intended for collection + def __init__( self, transform: InvertibleTransform, diff --git a/monai/data/torchscript_utils.py b/monai/data/torchscript_utils.py index cabf06ce89..507cf411d6 100644 --- a/monai/data/torchscript_utils.py +++ b/monai/data/torchscript_utils.py @@ -116,7 +116,7 @@ def load_net_with_metadata( Returns: Triple containing loaded object, metadata dict, and extra files dict containing other file data if present """ - extra_files = {f: "" for f in more_extra_files} + extra_files = dict.fromkeys(more_extra_files, "") extra_files[METADATA_FILENAME] = "" jit_obj = torch.jit.load(filename_prefix_or_stream, map_location, extra_files) diff --git a/monai/data/ultrasound_confidence_map.py b/monai/data/ultrasound_confidence_map.py index 03813e7559..865e4a0a0f 100644 --- a/monai/data/ultrasound_confidence_map.py +++ b/monai/data/ultrasound_confidence_map.py @@ -19,9 +19,11 @@ __all__ = ["UltrasoundConfidenceMap"] cv2, _ = optional_import("cv2") -csc_matrix, _ = optional_import("scipy.sparse", "1.7.1", min_version, "csc_matrix") -spsolve, _ = optional_import("scipy.sparse.linalg", "1.7.1", min_version, "spsolve") -hilbert, _ = optional_import("scipy.signal", "1.7.1", min_version, "hilbert") +csc_matrix, _ = optional_import("scipy.sparse", "1.12.0", min_version, "csc_matrix") +spsolve, _ = 
optional_import("scipy.sparse.linalg", "1.12.0", min_version, "spsolve") +cg, _ = optional_import("scipy.sparse.linalg", "1.12.0", min_version, "cg") +hilbert, _ = optional_import("scipy.signal", "1.12.0", min_version, "hilbert") +ruge_stuben_solver, _ = optional_import("pyamg", "5.0.0", min_version, "ruge_stuben_solver") class UltrasoundConfidenceMap: @@ -30,6 +32,9 @@ class UltrasoundConfidenceMap: It generates a confidence map by setting source and sink points in the image and computing the probability for random walks to reach the source for each pixel. + The official code is available at: + https://campar.in.tum.de/Main/AthanasiosKaramalisCode + Args: alpha (float, optional): Alpha parameter. Defaults to 2.0. beta (float, optional): Beta parameter. Defaults to 90.0. @@ -37,15 +42,33 @@ class UltrasoundConfidenceMap: mode (str, optional): 'RF' or 'B' mode data. Defaults to 'B'. sink_mode (str, optional): Sink mode. Defaults to 'all'. If 'mask' is selected, a mask must be when calling the transform. Can be 'all', 'mid', 'min', or 'mask'. + use_cg (bool, optional): Use Conjugate Gradient method for solving the linear system. Defaults to False. + cg_tol (float, optional): Tolerance for the Conjugate Gradient method. Defaults to 1e-6. + Will be used only if `use_cg` is True. + cg_maxiter (int, optional): Maximum number of iterations for the Conjugate Gradient method. Defaults to 200. + Will be used only if `use_cg` is True. 
""" - def __init__(self, alpha: float = 2.0, beta: float = 90.0, gamma: float = 0.05, mode="B", sink_mode="all"): + def __init__( + self, + alpha: float = 2.0, + beta: float = 90.0, + gamma: float = 0.05, + mode="B", + sink_mode="all", + use_cg=False, + cg_tol=1e-6, + cg_maxiter=200, + ): # The hyperparameters for confidence map estimation self.alpha = alpha self.beta = beta self.gamma = gamma self.mode = mode self.sink_mode = sink_mode + self.use_cg = use_cg + self.cg_tol = cg_tol + self.cg_maxiter = cg_maxiter # The precision to use for all computations self.eps = np.finfo("float64").eps @@ -228,17 +251,18 @@ def confidence_laplacian(self, padded_index: NDArray, padded_image: NDArray, bet s = self.normalize(s) # Horizontal penalty - s[:vertical_end] += gamma - # s[vertical_end:diagonal_end] += gamma * np.sqrt(2) # --> In the paper it is sqrt(2) - # since the diagonal edges are longer yet does not exist in the original code + s[vertical_end:] += gamma + # Here there is a difference between the official MATLAB code and the paper + # on the edge penalty. We directly implement what the official code does. # Normalize differences s = self.normalize(s) # Gaussian weighting function s = -( - (np.exp(-beta * s, dtype="float64")) + 1.0e-6 - ) # --> This epsilon changes results drastically default: 1.e-6 + (np.exp(-beta * s, dtype="float64")) + 1e-5 + ) # --> This epsilon changes results drastically default: 10e-6 + # Please notice that it is not 1e-6, it is 10e-6 which is actually different. 
# Create Laplacian, diagonal missing lap = csc_matrix((s, (i, j))) @@ -256,7 +280,14 @@ def confidence_laplacian(self, padded_index: NDArray, padded_image: NDArray, bet return lap def _solve_linear_system(self, lap, rhs): - x = spsolve(lap, rhs) + + if self.use_cg: + lap_sparse = lap.tocsr() + ml = ruge_stuben_solver(lap_sparse, coarse_solver="pinv") + m = ml.aspreconditioner(cycle="V") + x, _ = cg(lap, rhs, rtol=self.cg_tol, maxiter=self.cg_maxiter, M=m) + else: + x = spsolve(lap, rhs) return x diff --git a/monai/data/utils.py b/monai/data/utils.py index 585f02ec9e..7a08300abb 100644 --- a/monai/data/utils.py +++ b/monai/data/utils.py @@ -53,10 +53,6 @@ pytorch_after, ) -if pytorch_after(1, 13): - # import private code for reuse purposes, comment in case things break in the future - from torch.utils.data._utils.collate import collate_tensor_fn, default_collate_fn_map - pd, _ = optional_import("pandas") DataFrame, _ = optional_import("pandas", name="DataFrame") nib, _ = optional_import("nibabel") @@ -454,8 +450,13 @@ def collate_meta_tensor_fn(batch, *, collate_fn_map=None): Collate a sequence of meta tensor into a single batched metatensor. This is called by `collage_meta_tensor` and so should not be used as a collate function directly in dataloaders. 
""" - collate_fn = collate_tensor_fn if pytorch_after(1, 13) else default_collate - collated = collate_fn(batch) # type: ignore + if pytorch_after(1, 13): + from torch.utils.data._utils.collate import collate_tensor_fn # imported here for pylint/mypy issues + + collated = collate_tensor_fn(batch) + else: + collated = default_collate(batch) + meta_dicts = [i.meta or TraceKeys.NONE for i in batch] common_ = set.intersection(*[set(d.keys()) for d in meta_dicts if isinstance(d, dict)]) if common_: @@ -496,6 +497,8 @@ def list_data_collate(batch: Sequence): if pytorch_after(1, 13): # needs to go here to avoid circular import + from torch.utils.data._utils.collate import default_collate_fn_map + from monai.data.meta_tensor import MetaTensor default_collate_fn_map.update({MetaTensor: collate_meta_tensor_fn}) diff --git a/monai/data/video_dataset.py b/monai/data/video_dataset.py index be3bcf5bd5..031e85db26 100644 --- a/monai/data/video_dataset.py +++ b/monai/data/video_dataset.py @@ -173,15 +173,15 @@ def get_available_codecs() -> dict[str, str]: all_codecs = {"mp4v": ".mp4", "X264": ".avi", "H264": ".mp4", "MP42": ".mp4", "MJPG": ".mjpeg", "DIVX": ".avi"} codecs = {} with SuppressStderr(): - writer = cv2.VideoWriter() with tempfile.TemporaryDirectory() as tmp_dir: for codec, ext in all_codecs.items(): + writer = cv2.VideoWriter() fname = os.path.join(tmp_dir, f"test{ext}") fourcc = cv2.VideoWriter_fourcc(*codec) noviderr = writer.open(fname, fourcc, 1, (10, 10)) if noviderr: codecs[codec] = ext - writer.release() + writer.release() return codecs def get_num_frames(self) -> int: diff --git a/monai/data/wsi_datasets.py b/monai/data/wsi_datasets.py index 3488029a7a..2ee8c9d363 100644 --- a/monai/data/wsi_datasets.py +++ b/monai/data/wsi_datasets.py @@ -23,7 +23,7 @@ from monai.data.utils import iter_patch_position from monai.data.wsi_reader import BaseWSIReader, WSIReader from monai.transforms import ForegroundMask, Randomizable, apply_transform -from monai.utils import 
convert_to_dst_type, ensure_tuple_rep +from monai.utils import convert_to_dst_type, ensure_tuple, ensure_tuple_rep from monai.utils.enums import CommonKeys, ProbMapKeys, WSIPatchKeys __all__ = ["PatchWSIDataset", "SlidingPatchWSIDataset", "MaskedPatchWSIDataset"] @@ -123,9 +123,9 @@ def _get_label(self, sample: dict): def _get_location(self, sample: dict): if self.center_location: size = self._get_size(sample) - return [sample[WSIPatchKeys.LOCATION][i] - size[i] // 2 for i in range(len(size))] + return ensure_tuple(sample[WSIPatchKeys.LOCATION][i] - size[i] // 2 for i in range(len(size))) else: - return sample[WSIPatchKeys.LOCATION] + return ensure_tuple(sample[WSIPatchKeys.LOCATION]) def _get_level(self, sample: dict): if self.patch_level is None: diff --git a/monai/data/wsi_reader.py b/monai/data/wsi_reader.py index b31d4d9c3a..2a4fe9f7a8 100644 --- a/monai/data/wsi_reader.py +++ b/monai/data/wsi_reader.py @@ -1097,8 +1097,8 @@ def get_mpp(self, wsi, level: int) -> tuple[float, float]: ): unit = wsi.pages[level].tags.get("ResolutionUnit") if unit is not None: - unit = str(unit.value)[8:] - else: + unit = str(unit.value.name) + if unit is None or len(unit) == 0: warnings.warn("The resolution unit is missing. 
`micrometer` will be used as default.") unit = "micrometer" diff --git a/monai/engines/__init__.py b/monai/engines/__init__.py index d8dc51f620..93cc40e292 100644 --- a/monai/engines/__init__.py +++ b/monai/engines/__init__.py @@ -12,12 +12,14 @@ from __future__ import annotations from .evaluator import EnsembleEvaluator, Evaluator, SupervisedEvaluator -from .trainer import GanTrainer, SupervisedTrainer, Trainer +from .trainer import AdversarialTrainer, GanTrainer, SupervisedTrainer, Trainer from .utils import ( + DiffusionPrepareBatch, IterationEvents, PrepareBatch, PrepareBatchDefault, PrepareBatchExtraInput, + VPredictionPrepareBatch, default_make_latent, default_metric_cmp_fn, default_prepare_batch, diff --git a/monai/engines/trainer.py b/monai/engines/trainer.py index f1513ea73b..c1364fe015 100644 --- a/monai/engines/trainer.py +++ b/monai/engines/trainer.py @@ -24,7 +24,7 @@ from monai.engines.workflow import Workflow from monai.inferers import Inferer, SimpleInferer from monai.transforms import Transform -from monai.utils import GanKeys, min_version, optional_import +from monai.utils import AdversarialIterationEvents, AdversarialKeys, GanKeys, min_version, optional_import from monai.utils.enums import CommonKeys as Keys from monai.utils.enums import EngineStatsKeys as ESKeys from monai.utils.module import pytorch_after @@ -37,7 +37,7 @@ Metric, _ = optional_import("ignite.metrics", IgniteInfo.OPT_IMPORT_VERSION, min_version, "Metric") EventEnum, _ = optional_import("ignite.engine", IgniteInfo.OPT_IMPORT_VERSION, min_version, "EventEnum") -__all__ = ["Trainer", "SupervisedTrainer", "GanTrainer"] +__all__ = ["Trainer", "SupervisedTrainer", "GanTrainer", "AdversarialTrainer"] class Trainer(Workflow): @@ -471,3 +471,282 @@ def _iteration( GanKeys.GLOSS: g_loss.item(), GanKeys.DLOSS: d_total_loss.item(), } + + +class AdversarialTrainer(Trainer): + """ + Standard supervised training workflow for adversarial loss enabled neural networks. 
+ + Args: + device: an object representing the device on which to run. + max_epochs: the total epoch number for engine to run. + train_data_loader: Core ignite engines use `DataLoader` for training loop batchdata. + g_network: ''generator'' (G) network architecture. + g_optimizer: G optimizer function. + g_loss_function: G loss function for adversarial training. + recon_loss_function: G loss function for reconstructions. + d_network: discriminator (D) network architecture. + d_optimizer: D optimizer function. + d_loss_function: D loss function for adversarial training. + epoch_length: number of iterations for one epoch, default to `len(train_data_loader)`. + non_blocking: if True and this copy is between CPU and GPU, the copy may occur asynchronously with respect to + the host. For other cases, this argument has no effect. + prepare_batch: function to parse image and label for current iteration. + iteration_update: the callable function for every iteration, expect to accept `engine` and `batchdata` as input + parameters. if not provided, use `self._iteration()` instead. + g_inferer: inference method to execute G model forward. Defaults to ``SimpleInferer()``. + d_inferer: inference method to execute D model forward. Defaults to ``SimpleInferer()``. + postprocessing: execute additional transformation for the model output data. Typically, several Tensor based + transforms composed by `Compose`. Defaults to None + key_train_metric: compute metric when every iteration completed, and save average value to engine.state.metrics + when epoch completed. key_train_metric is the main metric to compare and save the checkpoint into files. + additional_metrics: more Ignite metrics that also attach to Ignite Engine.
+ metric_cmp_fn: function to compare current key metric with previous best key metric value, it must accept 2 args + (current_metric, previous_best) and return a bool result: if `True`, will update 'best_metric` and + `best_metric_epoch` with current metric and epoch, default to `greater than`. + train_handlers: every handler is a set of Ignite Event-Handlers, must have `attach` function, like: + CheckpointHandler, StatsHandler, etc. + amp: whether to enable auto-mixed-precision training, default is False. + event_names: additional custom ignite events that will register to the engine. + new events can be a list of str or `ignite.engine.events.EventEnum`. + event_to_attr: a dictionary to map an event to a state attribute, then add to `engine.state`. + for more details, check: https://pytorch.org/ignite/generated/ignite.engine.engine.Engine.html + #ignite.engine.engine.Engine.register_events. + decollate: whether to decollate the batch-first data to a list of data after model computation, recommend + `decollate=True` when `postprocessing` uses components from `monai.transforms`. default to `True`. + optim_set_to_none: when calling `optimizer.zero_grad()`, instead of setting to zero, set the grads to None. + more details: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html. + to_kwargs: dict of other args for `prepare_batch` API when converting the input data, except for + `device`, `non_blocking`. + amp_kwargs: dict of the args for `torch.cuda.amp.autocast()` API, for more details: + https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.autocast. 
+ """ + + def __init__( + self, + device: torch.device | str, + max_epochs: int, + train_data_loader: Iterable | DataLoader, + g_network: torch.nn.Module, + g_optimizer: Optimizer, + g_loss_function: Callable, + recon_loss_function: Callable, + d_network: torch.nn.Module, + d_optimizer: Optimizer, + d_loss_function: Callable, + epoch_length: int | None = None, + non_blocking: bool = False, + prepare_batch: Callable = default_prepare_batch, + iteration_update: Callable | None = None, + g_inferer: Inferer | None = None, + d_inferer: Inferer | None = None, + postprocessing: Transform | None = None, + key_train_metric: dict[str, Metric] | None = None, + additional_metrics: dict[str, Metric] | None = None, + metric_cmp_fn: Callable = default_metric_cmp_fn, + train_handlers: Sequence | None = None, + amp: bool = False, + event_names: list[str | EventEnum | type[EventEnum]] | None = None, + event_to_attr: dict | None = None, + decollate: bool = True, + optim_set_to_none: bool = False, + to_kwargs: dict | None = None, + amp_kwargs: dict | None = None, + ): + super().__init__( + device=device, + max_epochs=max_epochs, + data_loader=train_data_loader, + epoch_length=epoch_length, + non_blocking=non_blocking, + prepare_batch=prepare_batch, + iteration_update=iteration_update, + postprocessing=postprocessing, + key_metric=key_train_metric, + additional_metrics=additional_metrics, + metric_cmp_fn=metric_cmp_fn, + handlers=train_handlers, + amp=amp, + event_names=event_names, + event_to_attr=event_to_attr, + decollate=decollate, + to_kwargs=to_kwargs, + amp_kwargs=amp_kwargs, + ) + + self.register_events(*AdversarialIterationEvents) + + self.state.g_network = g_network + self.state.g_optimizer = g_optimizer + self.state.g_loss_function = g_loss_function + self.state.recon_loss_function = recon_loss_function + + self.state.d_network = d_network + self.state.d_optimizer = d_optimizer + self.state.d_loss_function = d_loss_function + + self.g_inferer = SimpleInferer() if g_inferer 
is None else g_inferer + self.d_inferer = SimpleInferer() if d_inferer is None else d_inferer + + self.state.g_scaler = torch.cuda.amp.GradScaler() if self.amp else None + self.state.d_scaler = torch.cuda.amp.GradScaler() if self.amp else None + + self.optim_set_to_none = optim_set_to_none + self._complete_state_dict_user_keys() + + def _complete_state_dict_user_keys(self) -> None: + """ + This method appends to the _state_dict_user_keys AdversarialTrainer's elements that are required for + checkpoint saving. + + Follows the example found at: + https://pytorch.org/ignite/generated/ignite.engine.engine.Engine.html#ignite.engine.engine.Engine.state_dict + """ + self._state_dict_user_keys.extend( + ["g_network", "g_optimizer", "d_network", "d_optimizer", "g_scaler", "d_scaler"] + ) + + g_loss_state_dict = getattr(self.state.g_loss_function, "state_dict", None) + if callable(g_loss_state_dict): + self._state_dict_user_keys.append("g_loss_function") + + d_loss_state_dict = getattr(self.state.d_loss_function, "state_dict", None) + if callable(d_loss_state_dict): + self._state_dict_user_keys.append("d_loss_function") + + recon_loss_state_dict = getattr(self.state.recon_loss_function, "state_dict", None) + if callable(recon_loss_state_dict): + self._state_dict_user_keys.append("recon_loss_function") + + def _iteration( + self, engine: AdversarialTrainer, batchdata: dict[str, torch.Tensor] + ) -> dict[str, torch.Tensor | int | float | bool]: + """ + Callback function for the Adversarial Training processing logic of 1 iteration in Ignite Engine. + Return below items in a dictionary: + - IMAGE: image Tensor data for model input, already moved to device. + - LABEL: label Tensor data corresponding to the image, already moved to device. In case of Unsupervised + Learning this is equal to IMAGE. + - PRED: prediction result of model. + - LOSS: loss value computed by loss functions of the generator (reconstruction and adversarial summed up). 
+ - AdversarialKeys.REALS: real images from the batch. Are the same as IMAGE. + - AdversarialKeys.FAKES: fake images generated by the generator. Are the same as PRED. + - AdversarialKeys.REAL_LOGITS: logits of the discriminator for the real images. + - AdversarialKeys.FAKE_LOGITS: logits of the discriminator for the fake images. + - AdversarialKeys.RECONSTRUCTION_LOSS: loss value computed by the reconstruction loss function. + - AdversarialKeys.GENERATOR_LOSS: loss value computed by the generator loss function. It is the + discriminator loss for the fake images. That is backpropagated through the generator only. + - AdversarialKeys.DISCRIMINATOR_LOSS: loss value computed by the discriminator loss function. It is the + discriminator loss for the real images and the fake images. That is backpropagated through the + discriminator only. + + Args: + engine: `AdversarialTrainer` to execute operation for an iteration. + batchdata: input data for this iteration, usually can be dictionary or tuple of Tensor data. + + Raises: + ValueError: must provide batch data for current iteration. 
+ + """ + + if batchdata is None: + raise ValueError("Must provide batch data for current iteration.") + batch = engine.prepare_batch(batchdata, engine.state.device, engine.non_blocking, **engine.to_kwargs) + + if len(batch) == 2: + inputs, targets = batch + args: tuple = () + kwargs: dict = {} + else: + inputs, targets, args, kwargs = batch + + engine.state.output = {Keys.IMAGE: inputs, Keys.LABEL: targets, AdversarialKeys.REALS: inputs} + + def _compute_generator_loss() -> None: + engine.state.output[AdversarialKeys.FAKES] = engine.g_inferer( + inputs, engine.state.g_network, *args, **kwargs + ) + engine.state.output[Keys.PRED] = engine.state.output[AdversarialKeys.FAKES] + engine.fire_event(AdversarialIterationEvents.GENERATOR_FORWARD_COMPLETED) + + engine.state.output[AdversarialKeys.FAKE_LOGITS] = engine.d_inferer( + engine.state.output[AdversarialKeys.FAKES].float().contiguous(), engine.state.d_network, *args, **kwargs + ) + engine.fire_event(AdversarialIterationEvents.GENERATOR_DISCRIMINATOR_FORWARD_COMPLETED) + + engine.state.output[AdversarialKeys.RECONSTRUCTION_LOSS] = engine.state.recon_loss_function( + engine.state.output[AdversarialKeys.FAKES], targets + ).mean() + engine.fire_event(AdversarialIterationEvents.RECONSTRUCTION_LOSS_COMPLETED) + + engine.state.output[AdversarialKeys.GENERATOR_LOSS] = engine.state.g_loss_function( + engine.state.output[AdversarialKeys.FAKE_LOGITS] + ).mean() + engine.fire_event(AdversarialIterationEvents.GENERATOR_LOSS_COMPLETED) + + # Train Generator + engine.state.g_network.train() + engine.state.g_optimizer.zero_grad(set_to_none=engine.optim_set_to_none) + + if engine.amp and engine.state.g_scaler is not None: + with torch.cuda.amp.autocast(**engine.amp_kwargs): + _compute_generator_loss() + + engine.state.output[Keys.LOSS] = ( + engine.state.output[AdversarialKeys.RECONSTRUCTION_LOSS] + + engine.state.output[AdversarialKeys.GENERATOR_LOSS] + ) + engine.state.g_scaler.scale(engine.state.output[Keys.LOSS]).backward() + 
engine.fire_event(AdversarialIterationEvents.GENERATOR_BACKWARD_COMPLETED) + engine.state.g_scaler.step(engine.state.g_optimizer) + engine.state.g_scaler.update() + else: + _compute_generator_loss() + ( + engine.state.output[AdversarialKeys.RECONSTRUCTION_LOSS] + + engine.state.output[AdversarialKeys.GENERATOR_LOSS] + ).backward() + engine.fire_event(AdversarialIterationEvents.GENERATOR_BACKWARD_COMPLETED) + engine.state.g_optimizer.step() + engine.fire_event(AdversarialIterationEvents.GENERATOR_MODEL_COMPLETED) + + def _compute_discriminator_loss() -> None: + engine.state.output[AdversarialKeys.REAL_LOGITS] = engine.d_inferer( + engine.state.output[AdversarialKeys.REALS].contiguous().detach(), + engine.state.d_network, + *args, + **kwargs, + ) + engine.fire_event(AdversarialIterationEvents.DISCRIMINATOR_REALS_FORWARD_COMPLETED) + + engine.state.output[AdversarialKeys.FAKE_LOGITS] = engine.d_inferer( + engine.state.output[AdversarialKeys.FAKES].contiguous().detach(), + engine.state.d_network, + *args, + **kwargs, + ) + engine.fire_event(AdversarialIterationEvents.DISCRIMINATOR_FAKES_FORWARD_COMPLETED) + + engine.state.output[AdversarialKeys.DISCRIMINATOR_LOSS] = engine.state.d_loss_function( + engine.state.output[AdversarialKeys.REAL_LOGITS], engine.state.output[AdversarialKeys.FAKE_LOGITS] + ).mean() + engine.fire_event(AdversarialIterationEvents.DISCRIMINATOR_LOSS_COMPLETED) + + # Train Discriminator + engine.state.d_network.train() + engine.state.d_network.zero_grad(set_to_none=engine.optim_set_to_none) + + if engine.amp and engine.state.d_scaler is not None: + with torch.cuda.amp.autocast(**engine.amp_kwargs): + _compute_discriminator_loss() + + engine.state.d_scaler.scale(engine.state.output[AdversarialKeys.DISCRIMINATOR_LOSS]).backward() + engine.fire_event(AdversarialIterationEvents.DISCRIMINATOR_BACKWARD_COMPLETED) + engine.state.d_scaler.step(engine.state.d_optimizer) + engine.state.d_scaler.update() + else: + _compute_discriminator_loss() + 
engine.state.output[AdversarialKeys.DISCRIMINATOR_LOSS].backward() + engine.state.d_optimizer.step() + + return engine.state.output diff --git a/monai/engines/utils.py b/monai/engines/utils.py index 02c718cd14..5339d6965a 100644 --- a/monai/engines/utils.py +++ b/monai/engines/utils.py @@ -13,9 +13,10 @@ from abc import ABC, abstractmethod from collections.abc import Callable, Sequence -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any, Mapping, cast import torch +import torch.nn as nn from monai.config import IgniteInfo from monai.transforms import apply_transform @@ -36,6 +37,8 @@ "PrepareBatch", "PrepareBatchDefault", "PrepareBatchExtraInput", + "DiffusionPrepareBatch", + "VPredictionPrepareBatch", "default_make_latent", "engine_apply_transform", "default_metric_cmp_fn", @@ -238,6 +241,78 @@ def _get_data(key: str) -> torch.Tensor: return cast(torch.Tensor, image), cast(torch.Tensor, label), tuple(args_), kwargs_ +class DiffusionPrepareBatch(PrepareBatch): + """ + This class is used as a callable for the `prepare_batch` parameter of engine classes for diffusion training. + + Assuming a supervised training process, it will generate a noise field using `get_noise` for an input image, and + return the image and noise field as the image/target pair plus the noise field the kwargs under the key "noise". + This assumes the inferer being used in conjunction with this class expects a "noise" parameter to be provided. + + If the `condition_name` is provided, this must refer to a key in the input dictionary containing the condition + field to be passed to the inferer. This will appear in the keyword arguments under the key "condition". 
+ + """ + + def __init__(self, num_train_timesteps: int, condition_name: str | None = None) -> None: + self.condition_name = condition_name + self.num_train_timesteps = num_train_timesteps + + def get_noise(self, images: torch.Tensor) -> torch.Tensor: + """Returns the noise tensor for input tensor `images`, override this for different noise distributions.""" + return torch.randn_like(images) + + def get_timesteps(self, images: torch.Tensor) -> torch.Tensor: + """Get a timestep, by default this is a random integer between 0 and `self.num_train_timesteps`.""" + return torch.randint(0, self.num_train_timesteps, (images.shape[0],), device=images.device).long() + + def get_target(self, images: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor: + """Return the target for the loss function, this is the `noise` value by default.""" + return noise + + def __call__( + self, + batchdata: dict[str, torch.Tensor], + device: str | torch.device | None = None, + non_blocking: bool = False, + **kwargs: Any, + ) -> tuple[torch.Tensor, torch.Tensor, tuple, dict]: + images, _ = default_prepare_batch(batchdata, device, non_blocking, **kwargs) + noise = self.get_noise(images).to(device, non_blocking=non_blocking, **kwargs) + timesteps = self.get_timesteps(images).to(device, non_blocking=non_blocking, **kwargs) + + target = self.get_target(images, noise, timesteps).to(device, non_blocking=non_blocking, **kwargs) + infer_kwargs = {"noise": noise, "timesteps": timesteps} + + if self.condition_name is not None and isinstance(batchdata, Mapping): + infer_kwargs["condition"] = batchdata[self.condition_name].to(device, non_blocking=non_blocking, **kwargs) + + # return input, target, arguments, and keyword arguments where noise is the target and also a keyword value + return images, target, (), infer_kwargs + + +class VPredictionPrepareBatch(DiffusionPrepareBatch): + """ + This class is used as a callable for the `prepare_batch` parameter of engine classes for 
diffusion training. + + Assuming a supervised training process, it will generate a noise field using `get_noise` for an input image, and + from this compute the velocity using the provided scheduler. This value is used as the target in place of the + noise field itself although the noise field is in the kwargs under the key "noise". This assumes the inferer + being used in conjunction with this class expects a "noise" parameter to be provided. + + If the `condition_name` is provided, this must refer to a key in the input dictionary containing the condition + field to be passed to the inferer. This will appear in the keyword arguments under the key "condition". + + """ + + def __init__(self, scheduler: nn.Module, num_train_timesteps: int, condition_name: str | None = None) -> None: + super().__init__(num_train_timesteps=num_train_timesteps, condition_name=condition_name) + self.scheduler = scheduler + + def get_target(self, images, noise, timesteps): + return self.scheduler.get_velocity(images, noise, timesteps) + + def default_make_latent( num_latents: int, latent_size: int, diff --git a/monai/fl/client/monai_algo.py b/monai/fl/client/monai_algo.py index 9acf131bd9..a3ac58c221 100644 --- a/monai/fl/client/monai_algo.py +++ b/monai/fl/client/monai_algo.py @@ -134,12 +134,14 @@ def initialize(self, extra=None): Args: extra: Dict with additional information that should be provided by FL system, - i.e., `ExtraItems.CLIENT_NAME` and `ExtraItems.APP_ROOT`. + i.e., `ExtraItems.CLIENT_NAME`, `ExtraItems.APP_ROOT` and `ExtraItems.LOGGING_FILE`. + You can disable the logging logic in the monai bundle by setting {ExtraItems.LOGGING_FILE} to False.
""" if extra is None: extra = {} self.client_name = extra.get(ExtraItems.CLIENT_NAME, "noname") + logging_file = extra.get(ExtraItems.LOGGING_FILE, None) self.logger.info(f"Initializing {self.client_name} ...") # FL platform needs to provide filepath to configuration files @@ -149,7 +151,7 @@ def initialize(self, extra=None): if self.workflow is None: config_train_files = self._add_config_files(self.config_train_filename) self.workflow = ConfigWorkflow( - config_file=config_train_files, meta_file=None, logging_file=None, workflow_type="train" + config_file=config_train_files, meta_file=None, logging_file=logging_file, workflow_type="train" ) self.workflow.initialize() self.workflow.bundle_root = self.bundle_root @@ -412,13 +414,15 @@ def initialize(self, extra=None): Args: extra: Dict with additional information that should be provided by FL system, - i.e., `ExtraItems.CLIENT_NAME` and `ExtraItems.APP_ROOT`. + i.e., `ExtraItems.CLIENT_NAME`, `ExtraItems.APP_ROOT` and `ExtraItems.LOGGING_FILE`. + You can disable the logging logic in the monai bundle by setting {ExtraItems.LOGGING_FILE} to False.
""" self._set_cuda_device() if extra is None: extra = {} self.client_name = extra.get(ExtraItems.CLIENT_NAME, "noname") + logging_file = extra.get(ExtraItems.LOGGING_FILE, None) timestamp = time.strftime("%Y%m%d_%H%M%S") self.logger.info(f"Initializing {self.client_name} ...") # FL platform needs to provide filepath to configuration files @@ -434,7 +438,7 @@ def initialize(self, extra=None): self.train_workflow = ConfigWorkflow( config_file=config_train_files, meta_file=None, - logging_file=None, + logging_file=logging_file, workflow_type="train", **self.train_kwargs, ) @@ -459,7 +463,7 @@ def initialize(self, extra=None): self.eval_workflow = ConfigWorkflow( config_file=config_eval_files, meta_file=None, - logging_file=None, + logging_file=logging_file, workflow_type=self.eval_workflow_name, **self.eval_kwargs, ) diff --git a/monai/fl/utils/constants.py b/monai/fl/utils/constants.py index eda1a6b4f9..18beceeaee 100644 --- a/monai/fl/utils/constants.py +++ b/monai/fl/utils/constants.py @@ -30,6 +30,7 @@ class ExtraItems(StrEnum): CLIENT_NAME = "fl_client_name" APP_ROOT = "fl_app_root" STATS_SENDER = "fl_stats_sender" + LOGGING_FILE = "logging_file" class FlPhase(StrEnum): diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index df209c1c8b..6d19579d9e 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -21,6 +21,7 @@ import torch from torch.utils.data import Dataset +from monai.apps.utils import get_logger from monai.config import IgniteInfo from monai.utils import CommonKeys, ensure_tuple, min_version, optional_import @@ -29,6 +30,9 @@ mlflow.entities, _ = optional_import( "mlflow.entities", descriptor="Please install mlflow.entities before using MLFlowHandler." ) +MlflowException, _ = optional_import( + "mlflow.exceptions", name="MlflowException", descriptor="Please install mlflow before using MLFlowHandler." 
+) pandas, _ = optional_import("pandas", descriptor="Please install pandas for recording the dataset.") tqdm, _ = optional_import("tqdm", "4.47.0", min_version, "tqdm") @@ -41,6 +45,8 @@ DEFAULT_TAG = "Loss" +logger = get_logger(module_name=__name__) + class MLFlowHandler: """ @@ -236,10 +242,21 @@ def start(self, engine: Engine) -> None: def _set_experiment(self): experiment = self.experiment if not experiment: - experiment = self.client.get_experiment_by_name(self.experiment_name) - if not experiment: - experiment_id = self.client.create_experiment(self.experiment_name) - experiment = self.client.get_experiment(experiment_id) + for _retry_time in range(3): + try: + experiment = self.client.get_experiment_by_name(self.experiment_name) + if not experiment: + experiment_id = self.client.create_experiment(self.experiment_name) + experiment = self.client.get_experiment(experiment_id) + break + except MlflowException as e: + if "RESOURCE_ALREADY_EXISTS" in str(e): + logger.warning("Experiment already exists; delaying before retrying.") + time.sleep(1) + if _retry_time == 2: + raise e + else: + raise e if experiment.lifecycle_stage != mlflow.entities.LifecycleStage.ACTIVE: raise ValueError(f"Cannot set a deleted experiment '{self.experiment_name}' as the active experiment") diff --git a/monai/inferers/__init__.py b/monai/inferers/__init__.py index 960380bfb8..fc78b9f7c4 100644 --- a/monai/inferers/__init__.py +++ b/monai/inferers/__init__.py @@ -12,13 +12,18 @@ from __future__ import annotations from .inferer import ( + ControlNetDiffusionInferer, + ControlNetLatentDiffusionInferer, + DiffusionInferer, Inferer, + LatentDiffusionInferer, PatchInferer, SaliencyInferer, SimpleInferer, SliceInferer, SlidingWindowInferer, SlidingWindowInfererAdapt, + VQVAETransformerInferer, ) from .merger import AvgMerger, Merger, ZarrAvgMerger from .splitter import SlidingWindowSplitter, Splitter, WSISlidingWindowSplitter diff --git a/monai/inferers/inferer.py b/monai/inferers/inferer.py 
index 0b4199938d..769b6cc0e7 100644 --- a/monai/inferers/inferer.py +++ b/monai/inferers/inferer.py @@ -11,24 +11,41 @@ from __future__ import annotations +import math import warnings from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence +from functools import partial from pydoc import locate from typing import Any import torch import torch.nn as nn +import torch.nn.functional as F from monai.apps.utils import get_logger +from monai.data import decollate_batch from monai.data.meta_tensor import MetaTensor from monai.data.thread_buffer import ThreadBuffer from monai.inferers.merger import AvgMerger, Merger from monai.inferers.splitter import Splitter from monai.inferers.utils import compute_importance_map, sliding_window_inference -from monai.utils import BlendMode, PatchKeys, PytorchPadMode, ensure_tuple, optional_import +from monai.networks.nets import ( + VQVAE, + AutoencoderKL, + ControlNet, + DecoderOnlyTransformer, + DiffusionModelUNet, + SPADEAutoencoderKL, + SPADEDiffusionModelUNet, +) +from monai.networks.schedulers import Scheduler +from monai.transforms import CenterSpatialCrop, SpatialPad +from monai.utils import BlendMode, Ordering, PatchKeys, PytorchPadMode, ensure_tuple, optional_import from monai.visualize import CAM, GradCAM, GradCAMpp +tqdm, has_tqdm = optional_import("tqdm", name="tqdm") + logger = get_logger(__name__) __all__ = [ @@ -752,3 +769,1264 @@ def network_wrapper( return out return tuple(out_i.unsqueeze(dim=self.spatial_dim + 2) for out_i in out) + + +class DiffusionInferer(Inferer): + """ + DiffusionInferer takes a trained diffusion model and a scheduler and can be used to perform a single forward pass + for a training iteration, and sample from the model. + + Args: + scheduler: diffusion scheduler.
+ """ + + def __init__(self, scheduler: Scheduler) -> None: # type: ignore[override] + super().__init__() + + self.scheduler = scheduler + + def __call__( # type: ignore[override] + self, + inputs: torch.Tensor, + diffusion_model: DiffusionModelUNet, + noise: torch.Tensor, + timesteps: torch.Tensor, + condition: torch.Tensor | None = None, + mode: str = "crossattn", + seg: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Implements the forward pass for a supervised training iteration. + + Args: + inputs: Input image to which noise is added. + diffusion_model: diffusion model. + noise: random noise, of the same shape as the input. + timesteps: random timesteps. + condition: Conditioning for network input. + mode: Conditioning mode for the network. + seg: if model is instance of SPADEDiffusionModelUnet, segmentation must be + provided on the forward (for SPADE-like AE or SPADE-like DM) + """ + if mode not in ["crossattn", "concat"]: + raise NotImplementedError(f"{mode} condition is not supported") + + noisy_image: torch.Tensor = self.scheduler.add_noise(original_samples=inputs, noise=noise, timesteps=timesteps) + if mode == "concat": + if condition is None: + raise ValueError("Conditioning is required for concat condition") + else: + noisy_image = torch.cat([noisy_image, condition], dim=1) + condition = None + diffusion_model = ( + partial(diffusion_model, seg=seg) + if isinstance(diffusion_model, SPADEDiffusionModelUNet) + else diffusion_model + ) + prediction: torch.Tensor = diffusion_model(x=noisy_image, timesteps=timesteps, context=condition) + + return prediction + + @torch.no_grad() + def sample( + self, + input_noise: torch.Tensor, + diffusion_model: DiffusionModelUNet, + scheduler: Scheduler | None = None, + save_intermediates: bool | None = False, + intermediate_steps: int | None = 100, + conditioning: torch.Tensor | None = None, + mode: str = "crossattn", + verbose: bool = True, + seg: torch.Tensor | None = None, + ) -> torch.Tensor | 
tuple[torch.Tensor, list[torch.Tensor]]: + """ + Args: + input_noise: random noise, of the same shape as the desired sample. + diffusion_model: model to sample from. + scheduler: diffusion scheduler. If none provided will use the class attribute scheduler + save_intermediates: whether to return intermediates along the sampling change + intermediate_steps: if save_intermediates is True, saves every n steps + conditioning: Conditioning for network input. + mode: Conditioning mode for the network. + verbose: if true, prints the progression bar of the sampling process. + seg: if diffusion model is instance of SPADEDiffusionModel, segmentation must be provided. + """ + if mode not in ["crossattn", "concat"]: + raise NotImplementedError(f"{mode} condition is not supported") + if mode == "concat" and conditioning is None: + raise ValueError("Conditioning must be supplied for if condition mode is concat.") + if not scheduler: + scheduler = self.scheduler + image = input_noise + if verbose and has_tqdm: + progress_bar = tqdm(scheduler.timesteps) + else: + progress_bar = iter(scheduler.timesteps) + intermediates = [] + for t in progress_bar: + # 1. predict noise model_output + diffusion_model = ( + partial(diffusion_model, seg=seg) + if isinstance(diffusion_model, SPADEDiffusionModelUNet) + else diffusion_model + ) + if mode == "concat" and conditioning is not None: + model_input = torch.cat([image, conditioning], dim=1) + model_output = diffusion_model( + model_input, timesteps=torch.Tensor((t,)).to(input_noise.device), context=None + ) + else: + model_output = diffusion_model( + image, timesteps=torch.Tensor((t,)).to(input_noise.device), context=conditioning + ) + + # 2. 
compute previous image: x_t -> x_t-1 + image, _ = scheduler.step(model_output, t, image) + if save_intermediates and t % intermediate_steps == 0: + intermediates.append(image) + if save_intermediates: + return image, intermediates + else: + return image + + @torch.no_grad() + def get_likelihood( + self, + inputs: torch.Tensor, + diffusion_model: DiffusionModelUNet, + scheduler: Scheduler | None = None, + save_intermediates: bool | None = False, + conditioning: torch.Tensor | None = None, + mode: str = "crossattn", + original_input_range: tuple = (0, 255), + scaled_input_range: tuple = (0, 1), + verbose: bool = True, + seg: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """ + Computes the log-likelihoods for an input. + + Args: + inputs: input images, NxCxHxW[xD] + diffusion_model: model to compute likelihood from + scheduler: diffusion scheduler. If none provided will use the class attribute scheduler. + save_intermediates: save the intermediate spatial KL maps + conditioning: Conditioning for network input. + mode: Conditioning mode for the network. + original_input_range: the [min,max] intensity range of the input data before any scaling was applied. + scaled_input_range: the [min,max] intensity range of the input data after scaling. + verbose: if true, prints the progression bar of the sampling process. + seg: if diffusion model is instance of SPADEDiffusionModel, segmentation must be provided. 
+ """ + + if not scheduler: + scheduler = self.scheduler + if scheduler._get_name() != "DDPMScheduler": + raise NotImplementedError( + f"Likelihood computation is only compatible with DDPMScheduler," + f" you are using {scheduler._get_name()}" + ) + if mode not in ["crossattn", "concat"]: + raise NotImplementedError(f"{mode} condition is not supported") + if mode == "concat" and conditioning is None: + raise ValueError("Conditioning must be supplied for if condition mode is concat.") + if verbose and has_tqdm: + progress_bar = tqdm(scheduler.timesteps) + else: + progress_bar = iter(scheduler.timesteps) + intermediates = [] + noise = torch.randn_like(inputs).to(inputs.device) + total_kl = torch.zeros(inputs.shape[0]).to(inputs.device) + for t in progress_bar: + timesteps = torch.full(inputs.shape[:1], t, device=inputs.device).long() + noisy_image = self.scheduler.add_noise(original_samples=inputs, noise=noise, timesteps=timesteps) + diffusion_model = ( + partial(diffusion_model, seg=seg) + if isinstance(diffusion_model, SPADEDiffusionModelUNet) + else diffusion_model + ) + if mode == "concat" and conditioning is not None: + noisy_image = torch.cat([noisy_image, conditioning], dim=1) + model_output = diffusion_model(noisy_image, timesteps=timesteps, context=None) + else: + model_output = diffusion_model(x=noisy_image, timesteps=timesteps, context=conditioning) + + # get the model's predicted mean, and variance if it is predicted + if model_output.shape[1] == inputs.shape[1] * 2 and scheduler.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = torch.split(model_output, inputs.shape[1], dim=1) + else: + predicted_variance = None + + # 1. compute alphas, betas + alpha_prod_t = scheduler.alphas_cumprod[t] + alpha_prod_t_prev = scheduler.alphas_cumprod[t - 1] if t > 0 else scheduler.one + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + # 2. 
compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if scheduler.prediction_type == "epsilon": + pred_original_sample = (noisy_image - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif scheduler.prediction_type == "sample": + pred_original_sample = model_output + elif scheduler.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * noisy_image - (beta_prod_t**0.5) * model_output + # 3. Clip "predicted x_0" + if scheduler.clip_sample: + pred_original_sample = torch.clamp(pred_original_sample, -1, 1) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * scheduler.betas[t]) / beta_prod_t + current_sample_coeff = scheduler.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + predicted_mean = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * noisy_image + + # get the posterior mean and variance + posterior_mean = scheduler._get_mean(timestep=t, x_0=inputs, x_t=noisy_image) + posterior_variance = scheduler._get_variance(timestep=t, predicted_variance=predicted_variance) + + log_posterior_variance = torch.log(posterior_variance) + log_predicted_variance = torch.log(predicted_variance) if predicted_variance else log_posterior_variance + + if t == 0: + # compute -log p(x_0|x_1) + kl = -self._get_decoder_log_likelihood( + inputs=inputs, + means=predicted_mean, + log_scales=0.5 * log_predicted_variance, + original_input_range=original_input_range, + scaled_input_range=scaled_input_range, + ) + else: + # compute kl between two normals + kl = 0.5 * ( + -1.0 + + log_predicted_variance + - log_posterior_variance + + torch.exp(log_posterior_variance - 
log_predicted_variance) + + ((posterior_mean - predicted_mean) ** 2) * torch.exp(-log_predicted_variance) + ) + total_kl += kl.view(kl.shape[0], -1).mean(dim=1) + if save_intermediates: + intermediates.append(kl.cpu()) + + if save_intermediates: + return total_kl, intermediates + else: + return total_kl + + def _approx_standard_normal_cdf(self, x): + """ + A fast approximation of the cumulative distribution function of the + standard normal. Code adapted from https://github.com/openai/improved-diffusion. + """ + + return 0.5 * ( + 1.0 + torch.tanh(torch.sqrt(torch.Tensor([2.0 / math.pi]).to(x.device)) * (x + 0.044715 * torch.pow(x, 3))) + ) + + def _get_decoder_log_likelihood( + self, + inputs: torch.Tensor, + means: torch.Tensor, + log_scales: torch.Tensor, + original_input_range: tuple = (0, 255), + scaled_input_range: tuple = (0, 1), + ) -> torch.Tensor: + """ + Compute the log-likelihood of a Gaussian distribution discretizing to a + given image. Code adapted from https://github.com/openai/improved-diffusion. + + Args: + input: the target images. It is assumed that this was uint8 values, + rescaled to the range [-1, 1]. + means: the Gaussian mean Tensor. + log_scales: the Gaussian log stddev Tensor. + original_input_range: the [min,max] intensity range of the input data before any scaling was applied. + scaled_input_range: the [min,max] intensity range of the input data after scaling. 
+ """ + if inputs.shape != means.shape: + raise ValueError(f"Inputs and means must have the same shape, got {inputs.shape} and {means.shape}") + bin_width = (scaled_input_range[1] - scaled_input_range[0]) / ( + original_input_range[1] - original_input_range[0] + ) + centered_x = inputs - means + inv_stdv = torch.exp(-log_scales) + plus_in = inv_stdv * (centered_x + bin_width / 2) + cdf_plus = self._approx_standard_normal_cdf(plus_in) + min_in = inv_stdv * (centered_x - bin_width / 2) + cdf_min = self._approx_standard_normal_cdf(min_in) + log_cdf_plus = torch.log(cdf_plus.clamp(min=1e-12)) + log_one_minus_cdf_min = torch.log((1.0 - cdf_min).clamp(min=1e-12)) + cdf_delta = cdf_plus - cdf_min + log_probs = torch.where( + inputs < -0.999, + log_cdf_plus, + torch.where(inputs > 0.999, log_one_minus_cdf_min, torch.log(cdf_delta.clamp(min=1e-12))), + ) + return log_probs + + +class LatentDiffusionInferer(DiffusionInferer): + """ + LatentDiffusionInferer takes a stage 1 model (VQVAE or AutoencoderKL), diffusion model, and a scheduler, and can + be used to perform a signal forward pass for a training iteration, and sample from the model. + + Args: + scheduler: a scheduler to be used in combination with `unet` to denoise the encoded image latents. + scale_factor: scale factor to multiply the values of the latent representation before processing it by the + second stage. + ldm_latent_shape: desired spatial latent space shape. Used if there is a difference in the autoencoder model's latent shape. + autoencoder_latent_shape: autoencoder_latent_shape: autoencoder spatial latent space shape. Used if there is a + difference between the autoencoder's latent shape and the DM shape. 
+ """ + + def __init__( + self, + scheduler: Scheduler, + scale_factor: float = 1.0, + ldm_latent_shape: list | None = None, + autoencoder_latent_shape: list | None = None, + ) -> None: + super().__init__(scheduler=scheduler) + self.scale_factor = scale_factor + if (ldm_latent_shape is None) ^ (autoencoder_latent_shape is None): + raise ValueError("If ldm_latent_shape is None, autoencoder_latent_shape must be None, and vice versa.") + self.ldm_latent_shape = ldm_latent_shape + self.autoencoder_latent_shape = autoencoder_latent_shape + if self.ldm_latent_shape is not None and self.autoencoder_latent_shape is not None: + self.ldm_resizer = SpatialPad(spatial_size=self.ldm_latent_shape) + self.autoencoder_resizer = CenterSpatialCrop(roi_size=self.autoencoder_latent_shape) + + def __call__( # type: ignore[override] + self, + inputs: torch.Tensor, + autoencoder_model: AutoencoderKL | VQVAE, + diffusion_model: DiffusionModelUNet, + noise: torch.Tensor, + timesteps: torch.Tensor, + condition: torch.Tensor | None = None, + mode: str = "crossattn", + seg: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Implements the forward pass for a supervised training iteration. + + Args: + inputs: input image to which the latent representation will be extracted and noise is added. + autoencoder_model: first stage model. + diffusion_model: diffusion model. + noise: random noise, of the same shape as the latent representation. + timesteps: random timesteps. + condition: conditioning for network input. + mode: Conditioning mode for the network. + seg: if diffusion model is instance of SPADEDiffusionModel, segmentation must be provided. 
+ """ + with torch.no_grad(): + latent = autoencoder_model.encode_stage_2_inputs(inputs) * self.scale_factor + + if self.ldm_latent_shape is not None: + latent = torch.stack([self.ldm_resizer(i) for i in decollate_batch(latent)], 0) + + prediction: torch.Tensor = super().__call__( + inputs=latent, + diffusion_model=diffusion_model, + noise=noise, + timesteps=timesteps, + condition=condition, + mode=mode, + seg=seg, + ) + return prediction + + @torch.no_grad() + def sample( # type: ignore[override] + self, + input_noise: torch.Tensor, + autoencoder_model: AutoencoderKL | VQVAE, + diffusion_model: DiffusionModelUNet, + scheduler: Scheduler | None = None, + save_intermediates: bool | None = False, + intermediate_steps: int | None = 100, + conditioning: torch.Tensor | None = None, + mode: str = "crossattn", + verbose: bool = True, + seg: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """ + Args: + input_noise: random noise, of the same shape as the desired latent representation. + autoencoder_model: first stage model. + diffusion_model: model to sample from. + scheduler: diffusion scheduler. If none provided will use the class attribute scheduler. + save_intermediates: whether to return intermediates along the sampling change + intermediate_steps: if save_intermediates is True, saves every n steps + conditioning: Conditioning for network input. + mode: Conditioning mode for the network. + verbose: if true, prints the progression bar of the sampling process. + seg: if diffusion model is instance of SPADEDiffusionModel, or autoencoder_model + is instance of SPADEAutoencoderKL, segmentation must be provided. 
+ """ + + if ( + isinstance(autoencoder_model, SPADEAutoencoderKL) + and isinstance(diffusion_model, SPADEDiffusionModelUNet) + and autoencoder_model.decoder.label_nc != diffusion_model.label_nc + ): + raise ValueError( + f"If both autoencoder_model and diffusion_model implement SPADE, the number of semantic" + f"labels for each must be compatible, but got {autoencoder_model.decoder.label_nc} and" + f"{diffusion_model.label_nc}" + ) + + outputs = super().sample( + input_noise=input_noise, + diffusion_model=diffusion_model, + scheduler=scheduler, + save_intermediates=save_intermediates, + intermediate_steps=intermediate_steps, + conditioning=conditioning, + mode=mode, + verbose=verbose, + seg=seg, + ) + + if save_intermediates: + latent, latent_intermediates = outputs + else: + latent = outputs + + if self.autoencoder_latent_shape is not None: + latent = torch.stack([self.autoencoder_resizer(i) for i in decollate_batch(latent)], 0) + latent_intermediates = [ + torch.stack([self.autoencoder_resizer(i) for i in decollate_batch(l)], 0) for l in latent_intermediates + ] + + decode = autoencoder_model.decode_stage_2_outputs + if isinstance(autoencoder_model, SPADEAutoencoderKL): + decode = partial(autoencoder_model.decode_stage_2_outputs, seg=seg) + image = decode(latent / self.scale_factor) + + if save_intermediates: + intermediates = [] + for latent_intermediate in latent_intermediates: + decode = autoencoder_model.decode_stage_2_outputs + if isinstance(autoencoder_model, SPADEAutoencoderKL): + decode = partial(autoencoder_model.decode_stage_2_outputs, seg=seg) + intermediates.append(decode(latent_intermediate / self.scale_factor)) + return image, intermediates + + else: + return image + + @torch.no_grad() + def get_likelihood( # type: ignore[override] + self, + inputs: torch.Tensor, + autoencoder_model: AutoencoderKL | VQVAE, + diffusion_model: DiffusionModelUNet, + scheduler: Scheduler | None = None, + save_intermediates: bool | None = False, + conditioning: 
torch.Tensor | None = None, + mode: str = "crossattn", + original_input_range: tuple | None = (0, 255), + scaled_input_range: tuple | None = (0, 1), + verbose: bool = True, + resample_latent_likelihoods: bool = False, + resample_interpolation_mode: str = "nearest", + seg: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """ + Computes the log-likelihoods of the latent representations of the input. + + Args: + inputs: input images, NxCxHxW[xD] + autoencoder_model: first stage model. + diffusion_model: model to compute likelihood from + scheduler: diffusion scheduler. If none provided will use the class attribute scheduler + save_intermediates: save the intermediate spatial KL maps + conditioning: Conditioning for network input. + mode: Conditioning mode for the network. + original_input_range: the [min,max] intensity range of the input data before any scaling was applied. + scaled_input_range: the [min,max] intensity range of the input data after scaling. + verbose: if true, prints the progression bar of the sampling process. + resample_latent_likelihoods: if true, resamples the intermediate likelihood maps to have the same spatial + dimension as the input images. + resample_interpolation_mode: if use resample_latent_likelihoods, select interpolation 'nearest', 'bilinear', + or 'trilinear; + seg: if diffusion model is instance of SPADEDiffusionModel, or autoencoder_model + is instance of SPADEAutoencoderKL, segmentation must be provided. 
+ """ + if resample_latent_likelihoods and resample_interpolation_mode not in ("nearest", "bilinear", "trilinear"): + raise ValueError( + f"resample_interpolation mode should be either nearest, bilinear, or trilinear, got {resample_interpolation_mode}" + ) + latents = autoencoder_model.encode_stage_2_inputs(inputs) * self.scale_factor + + if self.ldm_latent_shape is not None: + latents = torch.stack([self.ldm_resizer(i) for i in decollate_batch(latents)], 0) + + outputs = super().get_likelihood( + inputs=latents, + diffusion_model=diffusion_model, + scheduler=scheduler, + save_intermediates=save_intermediates, + conditioning=conditioning, + mode=mode, + verbose=verbose, + seg=seg, + ) + + if save_intermediates and resample_latent_likelihoods: + intermediates = outputs[1] + resizer = nn.Upsample(size=inputs.shape[2:], mode=resample_interpolation_mode) + intermediates = [resizer(x) for x in intermediates] + outputs = (outputs[0], intermediates) + return outputs + + +class ControlNetDiffusionInferer(DiffusionInferer): + """ + ControlNetDiffusionInferer takes a trained diffusion model and a scheduler and can be used to perform a signal + forward pass for a training iteration, and sample from the model, supporting ControlNet-based conditioning. + + Args: + scheduler: diffusion scheduler. + """ + + def __init__(self, scheduler: Scheduler) -> None: + Inferer.__init__(self) + self.scheduler = scheduler + + def __call__( # type: ignore[override] + self, + inputs: torch.Tensor, + diffusion_model: DiffusionModelUNet, + controlnet: ControlNet, + noise: torch.Tensor, + timesteps: torch.Tensor, + cn_cond: torch.Tensor, + condition: torch.Tensor | None = None, + mode: str = "crossattn", + seg: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Implements the forward pass for a supervised training iteration. + + Args: + inputs: Input image to which noise is added. + diffusion_model: diffusion model. + controlnet: controlnet sub-network. 
+ noise: random noise, of the same shape as the input. + timesteps: random timesteps. + cn_cond: conditioning image for the ControlNet. + condition: Conditioning for network input. + mode: Conditioning mode for the network. + seg: if model is instance of SPADEDiffusionModelUnet, segmentation must be + provided on the forward (for SPADE-like AE or SPADE-like DM) + """ + if mode not in ["crossattn", "concat"]: + raise NotImplementedError(f"{mode} condition is not supported") + + noisy_image = self.scheduler.add_noise(original_samples=inputs, noise=noise, timesteps=timesteps) + down_block_res_samples, mid_block_res_sample = controlnet( + x=noisy_image, timesteps=timesteps, controlnet_cond=cn_cond + ) + if mode == "concat" and condition is not None: + noisy_image = torch.cat([noisy_image, condition], dim=1) + condition = None + + diffuse = diffusion_model + if isinstance(diffusion_model, SPADEDiffusionModelUNet): + diffuse = partial(diffusion_model, seg=seg) + + prediction: torch.Tensor = diffuse( + x=noisy_image, + timesteps=timesteps, + context=condition, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ) + + return prediction + + @torch.no_grad() + def sample( # type: ignore[override] + self, + input_noise: torch.Tensor, + diffusion_model: DiffusionModelUNet, + controlnet: ControlNet, + cn_cond: torch.Tensor, + scheduler: Scheduler | None = None, + save_intermediates: bool | None = False, + intermediate_steps: int | None = 100, + conditioning: torch.Tensor | None = None, + mode: str = "crossattn", + verbose: bool = True, + seg: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """ + Args: + input_noise: random noise, of the same shape as the desired sample. + diffusion_model: model to sample from. + controlnet: controlnet sub-network. + cn_cond: conditioning image for the ControlNet. + scheduler: diffusion scheduler. 
If none provided will use the class attribute scheduler + save_intermediates: whether to return intermediates along the sampling change + intermediate_steps: if save_intermediates is True, saves every n steps + conditioning: Conditioning for network input. + mode: Conditioning mode for the network. + verbose: if true, prints the progression bar of the sampling process. + seg: if diffusion model is instance of SPADEDiffusionModel, segmentation must be provided. + """ + if mode not in ["crossattn", "concat"]: + raise NotImplementedError(f"{mode} condition is not supported") + + if not scheduler: + scheduler = self.scheduler + image = input_noise + if verbose and has_tqdm: + progress_bar = tqdm(scheduler.timesteps) + else: + progress_bar = iter(scheduler.timesteps) + intermediates = [] + for t in progress_bar: + # 1. ControlNet forward + down_block_res_samples, mid_block_res_sample = controlnet( + x=image, timesteps=torch.Tensor((t,)).to(input_noise.device), controlnet_cond=cn_cond + ) + # 2. predict noise model_output + diffuse = diffusion_model + if isinstance(diffusion_model, SPADEDiffusionModelUNet): + diffuse = partial(diffusion_model, seg=seg) + + if mode == "concat" and conditioning is not None: + model_input = torch.cat([image, conditioning], dim=1) + model_output = diffuse( + model_input, + timesteps=torch.Tensor((t,)).to(input_noise.device), + context=None, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ) + else: + model_output = diffuse( + image, + timesteps=torch.Tensor((t,)).to(input_noise.device), + context=conditioning, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ) + + # 3. 
compute previous image: x_t -> x_t-1 + image, _ = scheduler.step(model_output, t, image) + if save_intermediates and t % intermediate_steps == 0: + intermediates.append(image) + if save_intermediates: + return image, intermediates + else: + return image + + @torch.no_grad() + def get_likelihood( # type: ignore[override] + self, + inputs: torch.Tensor, + diffusion_model: DiffusionModelUNet, + controlnet: ControlNet, + cn_cond: torch.Tensor, + scheduler: Scheduler | None = None, + save_intermediates: bool | None = False, + conditioning: torch.Tensor | None = None, + mode: str = "crossattn", + original_input_range: tuple = (0, 255), + scaled_input_range: tuple = (0, 1), + verbose: bool = True, + seg: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """ + Computes the log-likelihoods for an input. + + Args: + inputs: input images, NxCxHxW[xD] + diffusion_model: model to compute likelihood from + controlnet: controlnet sub-network. + cn_cond: conditioning image for the ControlNet. + scheduler: diffusion scheduler. If none provided will use the class attribute scheduler. + save_intermediates: save the intermediate spatial KL maps + conditioning: Conditioning for network input. + mode: Conditioning mode for the network. + original_input_range: the [min,max] intensity range of the input data before any scaling was applied. + scaled_input_range: the [min,max] intensity range of the input data after scaling. + verbose: if true, prints the progression bar of the sampling process. + seg: if diffusion model is instance of SPADEDiffusionModel, segmentation must be provided. 
+ """ + + if not scheduler: + scheduler = self.scheduler + if scheduler._get_name() != "DDPMScheduler": + raise NotImplementedError( + f"Likelihood computation is only compatible with DDPMScheduler," + f" you are using {scheduler._get_name()}" + ) + if mode not in ["crossattn", "concat"]: + raise NotImplementedError(f"{mode} condition is not supported") + if verbose and has_tqdm: + progress_bar = tqdm(scheduler.timesteps) + else: + progress_bar = iter(scheduler.timesteps) + intermediates = [] + noise = torch.randn_like(inputs).to(inputs.device) + total_kl = torch.zeros(inputs.shape[0]).to(inputs.device) + for t in progress_bar: + timesteps = torch.full(inputs.shape[:1], t, device=inputs.device).long() + noisy_image = self.scheduler.add_noise(original_samples=inputs, noise=noise, timesteps=timesteps) + down_block_res_samples, mid_block_res_sample = controlnet( + x=noisy_image, timesteps=torch.Tensor((t,)).to(inputs.device), controlnet_cond=cn_cond + ) + + diffuse = diffusion_model + if isinstance(diffusion_model, SPADEDiffusionModelUNet): + diffuse = partial(diffusion_model, seg=seg) + + if mode == "concat" and conditioning is not None: + noisy_image = torch.cat([noisy_image, conditioning], dim=1) + model_output = diffuse( + noisy_image, + timesteps=timesteps, + context=None, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ) + else: + model_output = diffuse( + x=noisy_image, + timesteps=timesteps, + context=conditioning, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ) + # get the model's predicted mean, and variance if it is predicted + if model_output.shape[1] == inputs.shape[1] * 2 and scheduler.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = torch.split(model_output, inputs.shape[1], dim=1) + else: + predicted_variance = None + + # 1. 
compute alphas, betas + alpha_prod_t = scheduler.alphas_cumprod[t] + alpha_prod_t_prev = scheduler.alphas_cumprod[t - 1] if t > 0 else scheduler.one + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + # 2. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if scheduler.prediction_type == "epsilon": + pred_original_sample = (noisy_image - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif scheduler.prediction_type == "sample": + pred_original_sample = model_output + elif scheduler.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * noisy_image - (beta_prod_t**0.5) * model_output + # 3. Clip "predicted x_0" + if scheduler.clip_sample: + pred_original_sample = torch.clamp(pred_original_sample, -1, 1) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * scheduler.betas[t]) / beta_prod_t + current_sample_coeff = scheduler.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. 
Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + predicted_mean = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * noisy_image + + # get the posterior mean and variance + posterior_mean = scheduler._get_mean(timestep=t, x_0=inputs, x_t=noisy_image) + posterior_variance = scheduler._get_variance(timestep=t, predicted_variance=predicted_variance) + + log_posterior_variance = torch.log(posterior_variance) + log_predicted_variance = torch.log(predicted_variance) if predicted_variance else log_posterior_variance + + if t == 0: + # compute -log p(x_0|x_1) + kl = -super()._get_decoder_log_likelihood( + inputs=inputs, + means=predicted_mean, + log_scales=0.5 * log_predicted_variance, + original_input_range=original_input_range, + scaled_input_range=scaled_input_range, + ) + else: + # compute kl between two normals + kl = 0.5 * ( + -1.0 + + log_predicted_variance + - log_posterior_variance + + torch.exp(log_posterior_variance - log_predicted_variance) + + ((posterior_mean - predicted_mean) ** 2) * torch.exp(-log_predicted_variance) + ) + total_kl += kl.view(kl.shape[0], -1).mean(dim=1) + if save_intermediates: + intermediates.append(kl.cpu()) + + if save_intermediates: + return total_kl, intermediates + else: + return total_kl + + +class ControlNetLatentDiffusionInferer(ControlNetDiffusionInferer): + """ + ControlNetLatentDiffusionInferer takes a stage 1 model (VQVAE or AutoencoderKL), diffusion model, controlnet, + and a scheduler, and can be used to perform a signal forward pass for a training iteration, and sample from + the model. + + Args: + scheduler: a scheduler to be used in combination with `unet` to denoise the encoded image latents. + scale_factor: scale factor to multiply the values of the latent representation before processing it by the + second stage. + ldm_latent_shape: desired spatial latent space shape. Used if there is a difference in the autoencoder model's latent shape. 
+ autoencoder_latent_shape: autoencoder_latent_shape: autoencoder spatial latent space shape. Used if there is a + difference between the autoencoder's latent shape and the DM shape. + """ + + def __init__( + self, + scheduler: Scheduler, + scale_factor: float = 1.0, + ldm_latent_shape: list | None = None, + autoencoder_latent_shape: list | None = None, + ) -> None: + super().__init__(scheduler=scheduler) + self.scale_factor = scale_factor + if (ldm_latent_shape is None) ^ (autoencoder_latent_shape is None): + raise ValueError("If ldm_latent_shape is None, autoencoder_latent_shape must be None" "and vice versa.") + self.ldm_latent_shape = ldm_latent_shape + self.autoencoder_latent_shape = autoencoder_latent_shape + if self.ldm_latent_shape is not None and self.autoencoder_latent_shape is not None: + self.ldm_resizer = SpatialPad(spatial_size=self.ldm_latent_shape) + self.autoencoder_resizer = CenterSpatialCrop(roi_size=self.autoencoder_latent_shape) + + def __call__( # type: ignore[override] + self, + inputs: torch.Tensor, + autoencoder_model: AutoencoderKL | VQVAE, + diffusion_model: DiffusionModelUNet, + controlnet: ControlNet, + noise: torch.Tensor, + timesteps: torch.Tensor, + cn_cond: torch.Tensor, + condition: torch.Tensor | None = None, + mode: str = "crossattn", + seg: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Implements the forward pass for a supervised training iteration. + + Args: + inputs: input image to which the latent representation will be extracted and noise is added. + autoencoder_model: first stage model. + diffusion_model: diffusion model. + controlnet: instance of ControlNet model + noise: random noise, of the same shape as the latent representation. + timesteps: random timesteps. + cn_cond: conditioning tensor for the ControlNet network + condition: conditioning for network input. + mode: Conditioning mode for the network. + seg: if diffusion model is instance of SPADEDiffusionModel, segmentation must be provided. 
+ """ + with torch.no_grad(): + latent = autoencoder_model.encode_stage_2_inputs(inputs) * self.scale_factor + + if self.ldm_latent_shape is not None: + latent = torch.stack([self.ldm_resizer(i) for i in decollate_batch(latent)], 0) + + if cn_cond.shape[2:] != latent.shape[2:]: + cn_cond = F.interpolate(cn_cond, latent.shape[2:]) + + prediction = super().__call__( + inputs=latent, + diffusion_model=diffusion_model, + controlnet=controlnet, + noise=noise, + timesteps=timesteps, + cn_cond=cn_cond, + condition=condition, + mode=mode, + seg=seg, + ) + + return prediction + + @torch.no_grad() + def sample( # type: ignore[override] + self, + input_noise: torch.Tensor, + autoencoder_model: AutoencoderKL | VQVAE, + diffusion_model: DiffusionModelUNet, + controlnet: ControlNet, + cn_cond: torch.Tensor, + scheduler: Scheduler | None = None, + save_intermediates: bool | None = False, + intermediate_steps: int | None = 100, + conditioning: torch.Tensor | None = None, + mode: str = "crossattn", + verbose: bool = True, + seg: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """ + Args: + input_noise: random noise, of the same shape as the desired latent representation. + autoencoder_model: first stage model. + diffusion_model: model to sample from. + controlnet: instance of ControlNet model. + cn_cond: conditioning tensor for the ControlNet network. + scheduler: diffusion scheduler. If none provided will use the class attribute scheduler. + save_intermediates: whether to return intermediates along the sampling change + intermediate_steps: if save_intermediates is True, saves every n steps + conditioning: Conditioning for network input. + mode: Conditioning mode for the network. + verbose: if true, prints the progression bar of the sampling process. + seg: if diffusion model is instance of SPADEDiffusionModel, or autoencoder_model + is instance of SPADEAutoencoderKL, segmentation must be provided. 
+ """ + + if ( + isinstance(autoencoder_model, SPADEAutoencoderKL) + and isinstance(diffusion_model, SPADEDiffusionModelUNet) + and autoencoder_model.decoder.label_nc != diffusion_model.label_nc + ): + raise ValueError( + "If both autoencoder_model and diffusion_model implement SPADE, the number of semantic" + "labels for each must be compatible. Got {autoencoder_model.decoder.label_nc} and {diffusion_model.label_nc}" + ) + + if cn_cond.shape[2:] != input_noise.shape[2:]: + cn_cond = F.interpolate(cn_cond, input_noise.shape[2:]) + + outputs = super().sample( + input_noise=input_noise, + diffusion_model=diffusion_model, + controlnet=controlnet, + cn_cond=cn_cond, + scheduler=scheduler, + save_intermediates=save_intermediates, + intermediate_steps=intermediate_steps, + conditioning=conditioning, + mode=mode, + verbose=verbose, + seg=seg, + ) + + if save_intermediates: + latent, latent_intermediates = outputs + else: + latent = outputs + + if self.autoencoder_latent_shape is not None: + latent = torch.stack([self.autoencoder_resizer(i) for i in decollate_batch(latent)], 0) + latent_intermediates = [ + torch.stack([self.autoencoder_resizer(i) for i in decollate_batch(l)], 0) for l in latent_intermediates + ] + + decode = autoencoder_model.decode_stage_2_outputs + if isinstance(autoencoder_model, SPADEAutoencoderKL): + decode = partial(autoencoder_model.decode_stage_2_outputs, seg=seg) + + image = decode(latent / self.scale_factor) + + if save_intermediates: + intermediates = [] + for latent_intermediate in latent_intermediates: + decode = autoencoder_model.decode_stage_2_outputs + if isinstance(autoencoder_model, SPADEAutoencoderKL): + decode = partial(autoencoder_model.decode_stage_2_outputs, seg=seg) + intermediates.append(decode(latent_intermediate / self.scale_factor)) + return image, intermediates + + else: + return image + + @torch.no_grad() + def get_likelihood( # type: ignore[override] + self, + inputs: torch.Tensor, + autoencoder_model: AutoencoderKL | VQVAE, 
+ diffusion_model: DiffusionModelUNet, + controlnet: ControlNet, + cn_cond: torch.Tensor, + scheduler: Scheduler | None = None, + save_intermediates: bool | None = False, + conditioning: torch.Tensor | None = None, + mode: str = "crossattn", + original_input_range: tuple | None = (0, 255), + scaled_input_range: tuple | None = (0, 1), + verbose: bool = True, + resample_latent_likelihoods: bool = False, + resample_interpolation_mode: str = "nearest", + seg: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """ + Computes the log-likelihoods of the latent representations of the input. + + Args: + inputs: input images, NxCxHxW[xD] + autoencoder_model: first stage model. + diffusion_model: model to compute likelihood from + controlnet: instance of ControlNet model. + cn_cond: conditioning tensor for the ControlNet network. + scheduler: diffusion scheduler. If none provided will use the class attribute scheduler + save_intermediates: save the intermediate spatial KL maps + conditioning: Conditioning for network input. + mode: Conditioning mode for the network. + original_input_range: the [min,max] intensity range of the input data before any scaling was applied. + scaled_input_range: the [min,max] intensity range of the input data after scaling. + verbose: if true, prints the progression bar of the sampling process. + resample_latent_likelihoods: if true, resamples the intermediate likelihood maps to have the same spatial + dimension as the input images. + resample_interpolation_mode: if use resample_latent_likelihoods, select interpolation 'nearest', 'bilinear', + or 'trilinear; + seg: if diffusion model is instance of SPADEDiffusionModel, or autoencoder_model + is instance of SPADEAutoencoderKL, segmentation must be provided. 
class VQVAETransformerInferer(nn.Module):
    """
    Class to perform inference with a VQVAE + Transformer model.

    The VQVAE turns an image into a grid of discrete code indices, the ordering flattens that grid
    into a sequence, and the transformer models the sequence autoregressively. The value
    ``vqvae_model.num_embeddings`` is used as the "Begin Of Sentence" (BOS) token, so the
    transformer must be built with ``vqvae_model.num_embeddings + 1`` tokens.
    """

    def __init__(self) -> None:
        # NOTE(review): only Inferer.__init__ is invoked although the class derives from nn.Module.
        # Whether nn.Module's own state gets initialised depends on Inferer, which is not visible
        # here -- confirm before relying on nn.Module APIs (``.to()``, ``state_dict()``, ...).
        Inferer.__init__(self)

    def __call__(
        self,
        inputs: torch.Tensor,
        vqvae_model: VQVAE,
        transformer_model: DecoderOnlyTransformer,
        ordering: Ordering,
        condition: torch.Tensor | None = None,
        return_latent: bool = False,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor, tuple]:
        """
        Implements the forward pass for a supervised training iteration.

        Args:
            inputs: input image to which the latent representation will be extracted.
            vqvae_model: first stage model.
            transformer_model: autoregressive transformer model.
            ordering: ordering of the quantised latent representation.
            condition: conditioning for network input.
            return_latent: also return latent sequence and spatial dim of the latent.

        Returns:
            The transformer prediction, or ``(prediction, target, latent_spatial_dim)`` when
            ``return_latent`` is true. ``target`` is cropped to the same window as the prediction.
        """
        # the first stage is only used as a fixed tokenizer here, so no gradients are tracked for it
        with torch.no_grad():
            latent = vqvae_model.index_quantize(inputs)

        latent_spatial_dim = tuple(latent.shape[1:])
        latent = latent.reshape(latent.shape[0], -1)
        latent = latent[:, ordering.get_sequence_ordering()]

        # get the targets for the loss
        target = latent.clone()
        # Use the value from vqvae_model's num_embeddings as the starting token, the "Begin Of Sentence" (BOS) token.
        # Note the transformer_model must have vqvae_model.num_embeddings + 1 defined as num_tokens.
        latent = F.pad(latent, (1, 0), "constant", vqvae_model.num_embeddings)
        # crop the last token as we do not need the probability of the token that follows it
        latent = latent[:, :-1]
        latent = latent.long()

        # train on a part of the sequence if it is longer than max_seq_length
        seq_len = latent.shape[1]
        max_seq_len = transformer_model.max_seq_len
        if max_seq_len < seq_len:
            # random window start so that every part of a long sequence is eventually seen
            start = int(torch.randint(low=0, high=seq_len + 1 - max_seq_len, size=(1,)).item())
        else:
            start = 0
        prediction: torch.Tensor = transformer_model(x=latent[:, start : start + max_seq_len], context=condition)
        if return_latent:
            return prediction, target[:, start : start + max_seq_len], latent_spatial_dim
        else:
            return prediction

    @torch.no_grad()
    def sample(
        self,
        latent_spatial_dim: tuple[int, int, int] | tuple[int, int],
        starting_tokens: torch.Tensor,
        vqvae_model: VQVAE,
        transformer_model: DecoderOnlyTransformer,
        ordering: Ordering,
        conditioning: torch.Tensor | None = None,
        temperature: float = 1.0,
        top_k: int | None = None,
        verbose: bool = True,
    ) -> torch.Tensor:
        """
        Sampling function for the VQVAE + Transformer model.

        Args:
            latent_spatial_dim: shape of the sampled image.
            starting_tokens: starting tokens for the sampling. It must be vqvae_model.num_embeddings value.
            vqvae_model: first stage model.
            transformer_model: model to sample from.
            ordering: ordering of the quantised latent representation; its reverse ordering is
                applied to the sampled sequence before it is reshaped to ``latent_spatial_dim``.
            conditioning: Conditioning for network input.
            temperature: temperature for sampling.
            top_k: top k sampling.
            verbose: if true, prints the progression bar of the sampling process.

        Returns:
            The output of ``vqvae_model.decode_samples`` applied to the sampled latent indices.
        """
        seq_len = math.prod(latent_spatial_dim)

        if verbose and has_tqdm:
            progress_bar = tqdm(range(seq_len))
        else:
            progress_bar = iter(range(seq_len))

        latent_seq = starting_tokens.long()
        for _ in progress_bar:
            # if the sequence context is growing too long we must crop it at block_size
            if latent_seq.size(1) <= transformer_model.max_seq_len:
                idx_cond = latent_seq
            else:
                idx_cond = latent_seq[:, -transformer_model.max_seq_len :]

            # forward the model to get the logits for the index in the sequence
            logits = transformer_model(x=idx_cond, context=conditioning)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float("Inf")
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # remove the chance to be sampled the BOS token
            probs[:, vqvae_model.num_embeddings] = 0
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            latent_seq = torch.cat((latent_seq, idx_next), dim=1)

        # drop the BOS token, undo the sequence ordering and restore the spatial layout
        latent_seq = latent_seq[:, 1:]
        latent_seq = latent_seq[:, ordering.get_revert_sequence_ordering()]
        latent = latent_seq.reshape((starting_tokens.shape[0],) + latent_spatial_dim)

        return vqvae_model.decode_samples(latent)

    @torch.no_grad()
    def get_likelihood(
        self,
        inputs: torch.Tensor,
        vqvae_model: VQVAE,
        transformer_model: DecoderOnlyTransformer,
        ordering: Ordering,
        condition: torch.Tensor | None = None,
        resample_latent_likelihoods: bool = False,
        resample_interpolation_mode: str = "nearest",
        verbose: bool = False,
    ) -> torch.Tensor:
        """
        Computes the log-likelihoods of the latent representations of the input.

        Args:
            inputs: input images, NxCxHxW[xD]
            vqvae_model: first stage model.
            transformer_model: autoregressive transformer model.
            ordering: ordering of the quantised latent representation.
            condition: conditioning for network input.
            resample_latent_likelihoods: if true, resamples the intermediate likelihood maps to have the same spatial
                dimension as the input images.
            resample_interpolation_mode: if use resample_latent_likelihoods, select interpolation 'nearest', 'bilinear',
                or 'trilinear'.
            verbose: if true, prints the progression bar of the sampling process.

        Returns:
            Per-token log-likelihoods arranged on the latent grid, or resampled to the spatial
            size of ``inputs`` (with an added channel dimension) when ``resample_latent_likelihoods`` is true.

        Raises:
            ValueError: when resampling is requested with an unsupported interpolation mode.
        """
        if resample_latent_likelihoods and resample_interpolation_mode not in ("nearest", "bilinear", "trilinear"):
            raise ValueError(
                f"resample_interpolation mode should be either nearest, bilinear, or trilinear, got {resample_interpolation_mode}"
            )

        # the whole method already runs under the @torch.no_grad() decorator
        latent = vqvae_model.index_quantize(inputs)

        latent_spatial_dim = tuple(latent.shape[1:])
        latent = latent.reshape(latent.shape[0], -1)
        latent = latent[:, ordering.get_sequence_ordering()]
        seq_len = math.prod(latent_spatial_dim)

        # Use the value from vqvae_model's num_embeddings as the starting token, the "Begin Of Sentence" (BOS) token.
        # Note the transformer_model must have vqvae_model.num_embeddings + 1 defined as num_tokens.
        latent = F.pad(latent, (1, 0), "constant", vqvae_model.num_embeddings)
        latent = latent.long()
        max_seq_len = transformer_model.max_seq_len

        # get the first batch, up to max_seq_length, efficiently
        logits = transformer_model(x=latent[:, :max_seq_len], context=condition)
        # log_softmax instead of log(softmax(.)): the latter underflows to -inf for very unlikely tokens
        log_probs = F.log_softmax(logits, dim=-1)
        # target token for each set of logits is the next token along
        target = latent[:, 1:]
        pieces = [torch.gather(log_probs, 2, target[:, :max_seq_len].unsqueeze(2)).squeeze(2)]

        # if we have not covered the full sequence we continue with inefficient looping
        if pieces[0].shape[1] < target.shape[1]:
            if verbose and has_tqdm:
                progress_bar = tqdm(range(max_seq_len, seq_len))
            else:
                progress_bar = iter(range(max_seq_len, seq_len))

            for i in progress_bar:
                # sliding window ending at the token that precedes target[:, i]
                idx_cond = latent[:, i + 1 - max_seq_len : i + 1]
                # forward the model to get the logits for the index in the sequence
                logits = transformer_model(x=idx_cond, context=condition)
                # pluck the logits at the final step and convert to log-probabilities
                step_log_probs = F.log_softmax(logits[:, -1, :], dim=-1)
                # select correct values; collected in a list and concatenated once after the loop
                pieces.append(torch.gather(step_log_probs, 1, target[:, i].unsqueeze(1)))

        log_likelihood = torch.cat(pieces, dim=1)

        # undo the sequence ordering and restore the latent grid
        log_likelihood = log_likelihood[:, ordering.get_revert_sequence_ordering()]
        probs_reshaped = log_likelihood.reshape((inputs.shape[0],) + latent_spatial_dim)
        if resample_latent_likelihoods:
            resizer = nn.Upsample(size=inputs.shape[2:], mode=resample_interpolation_mode)
            probs_reshaped = resizer(probs_reshaped[:, None, ...])

        return probs_reshaped
class BarlowTwinsLoss(_Loss):
    """
    Barlow Twins redundancy-reduction loss.

    Two batches of representations (typically extracted by a network from two distorted views of
    the same samples, or from two modalities) are standardised along the batch dimension and their
    cross-correlation matrix is pushed towards the identity: the diagonal terms encourage the two
    representations to agree, the off-diagonal terms discourage redundancy between features.

    Zbontar, Jure, et al. "Barlow Twins: Self-Supervised Learning via Redundancy Reduction" International
    conference on machine learning. PMLR, 2020. (http://proceedings.mlr.press/v139/zbontar21a/zbontar21a.pdf)

    Adapted from:
        https://github.com/facebookresearch/barlowtwins

    """

    def __init__(self, lambd: float = 5e-3) -> None:
        """
        Args:
            lambd: weight applied to the off-diagonal (redundancy) terms of the squared difference
                between the cross-correlation matrix and the identity. Defaults to 5e-3.

        """
        super().__init__()
        self.lambd = lambd

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """
        Args:
            input: the shape should be B[F].
            target: the shape should be B[F].

        Returns:
            torch.Tensor: scalar loss, the weighted sum of the squared entries of ``c - I``.

        Raises:
            ValueError: When an input of dimension length > 2 is passed
            ValueError: When input and target are of different shapes
            ValueError: When batch size is less than or equal to 1

        """
        if target.dim() > 2 or input.dim() > 2:
            raise ValueError(
                f"Either target or input has dimensions greater than 2 where target "
                f"shape is ({target.shape}) and input shape is ({input.shape})"
            )

        if target.shape != input.shape:
            raise ValueError(f"ground truth has differing shape ({target.shape}) from input ({input.shape})")

        if target.size(0) <= 1:
            raise ValueError(
                f"Batch size must be greater than 1 to compute Barlow Twins Loss, but got {target.size(0)}"
            )

        off_diag_weight = torch.as_tensor(self.lambd).to(input.device)
        n_samples = input.shape[0]

        def _standardise(feats: torch.Tensor) -> torch.Tensor:
            # zero mean / unit std per feature along the batch axis; 1e-6 guards against a zero std
            return (feats - feats.mean(0)) / feats.std(0).add(1e-6)

        z_input = _standardise(input)
        z_target = _standardise(target)

        # cross-correlation matrix: (FxB) @ (BxF) -> FxF
        cross_corr = torch.mm(z_input.t(), z_target) / n_samples

        # squared distance to the identity, with the off-diagonal entries down-weighted
        identity = torch.eye(cross_corr.size(0), device=cross_corr.device)
        sq_diff = (cross_corr - identity).pow_(2)  # FxF
        sq_diff[~identity.bool()] *= off_diag_weight

        return sq_diff.sum()
""" super().__init__() @@ -728,7 +732,12 @@ def __init__( batch=batch, weight=dice_weight, ) - self.cross_entropy = nn.CrossEntropyLoss(weight=weight, reduction=reduction) + if pytorch_after(1, 10): + self.cross_entropy = nn.CrossEntropyLoss( + weight=weight, reduction=reduction, label_smoothing=label_smoothing + ) + else: + self.cross_entropy = nn.CrossEntropyLoss(weight=weight, reduction=reduction) self.binary_cross_entropy = nn.BCEWithLogitsLoss(pos_weight=weight, reduction=reduction) if lambda_dice < 0.0: raise ValueError("lambda_dice should be no less than 0.0.") @@ -778,12 +787,22 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: Raises: ValueError: When number of dimensions for input and target are different. - ValueError: When number of channels for target is neither 1 nor the same as input. + ValueError: When number of channels for target is neither 1 (without one-hot encoding) nor the same as input. + + Returns: + torch.Tensor: value of the loss. """ - if len(input.shape) != len(target.shape): + if input.dim() != target.dim(): raise ValueError( "the number of dimensions for input and target should be the same, " + f"got shape {input.shape} (nb dims: {len(input.shape)}) and {target.shape} (nb dims: {len(target.shape)}). " + "if target is not one-hot encoded, please provide a tensor with shape B1H[WD]." + ) + + if target.shape[1] != 1 and target.shape[1] != input.shape[1]: + raise ValueError( + "number of channels for target is neither 1 (without one-hot encoding) nor the same as input, " f"got shape {input.shape} and {target.shape}." ) @@ -801,7 +820,7 @@ class DiceFocalLoss(_Loss): The details of Focal Loss is shown in ``monai.losses.FocalLoss``. ``gamma`` and ``lambda_focal`` are only used for the focal loss. 
- ``include_background``, ``weight`` and ``reduction`` are used for both losses + ``include_background``, ``weight``, ``reduction``, and ``alpha`` are used for both losses, and other parameters are only used for dice loss. """ @@ -827,6 +846,7 @@ def __init__( weight: Sequence[float] | float | int | torch.Tensor | None = None, lambda_dice: float = 1.0, lambda_focal: float = 1.0, + alpha: float | None = None, ) -> None: """ Args: @@ -861,7 +881,8 @@ def __init__( Defaults to 1.0. lambda_focal: the trade-off weight value for focal loss. The value should be no less than 0.0. Defaults to 1.0. - + alpha: value of the alpha in the definition of the alpha-balanced Focal loss. The value should be in + [0, 1]. Defaults to None. """ super().__init__() weight = focal_weight if focal_weight is not None else weight @@ -880,7 +901,12 @@ def __init__( weight=weight, ) self.focal = FocalLoss( - include_background=include_background, to_onehot_y=False, gamma=gamma, weight=weight, reduction=reduction + include_background=include_background, + to_onehot_y=False, + gamma=gamma, + weight=weight, + alpha=alpha, + reduction=reduction, ) if lambda_dice < 0.0: raise ValueError("lambda_dice should be no less than 0.0.") @@ -899,14 +925,24 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: Raises: ValueError: When number of dimensions for input and target are different. - ValueError: When number of channels for target is neither 1 nor the same as input. + ValueError: When number of channels for target is neither 1 (without one-hot encoding) nor the same as input. + Returns: + torch.Tensor: value of the loss. """ - if len(input.shape) != len(target.shape): + if input.dim() != target.dim(): raise ValueError( "the number of dimensions for input and target should be the same, " + f"got shape {input.shape} (nb dims: {len(input.shape)}) and {target.shape} (nb dims: {len(target.shape)}). 
" + "if target is not one-hot encoded, please provide a tensor with shape B1H[WD]." + ) + + if target.shape[1] != 1 and target.shape[1] != input.shape[1]: + raise ValueError( + "number of channels for target is neither 1 (without one-hot encoding) nor the same as input, " f"got shape {input.shape} and {target.shape}." ) + if self.to_onehot_y: n_pred_ch = input.shape[1] if n_pred_ch == 1: @@ -1015,15 +1051,23 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: target (torch.Tensor): the shape should be BNH[WD] or B1H[WD]. Raises: - ValueError: When the input and target tensors have different numbers of dimensions, or the target - channel isn't either one-hot encoded or categorical with the same shape of the input. + ValueError: When number of dimensions for input and target are different. + ValueError: When number of channels for target is neither 1 (without one-hot encoding) nor the same as input. Returns: torch.Tensor: value of the loss. """ if input.dim() != target.dim(): raise ValueError( - f"Input - {input.shape} - and target - {target.shape} - must have the same number of dimensions." + "the number of dimensions for input and target should be the same, " + f"got shape {input.shape} (nb dims: {len(input.shape)}) and {target.shape} (nb dims: {len(target.shape)}). " + "if target is not one-hot encoded, please provide a tensor with shape B1H[WD]." + ) + + if target.shape[1] != 1 and target.shape[1] != input.shape[1]: + raise ValueError( + "number of channels for target is neither 1 (without one-hot encoding) nor the same as input, " + f"got shape {input.shape} and {target.shape}." 
) gdl_loss = self.generalized_dice(input, target) diff --git a/monai/losses/ds_loss.py b/monai/losses/ds_loss.py index 57fcff6b87..aacc16874d 100644 --- a/monai/losses/ds_loss.py +++ b/monai/losses/ds_loss.py @@ -33,7 +33,7 @@ def __init__(self, loss: _Loss, weight_mode: str = "exp", weights: list[float] | weight_mode: {``"same"``, ``"exp"``, ``"two"``} Specifies the weights calculation for each image level. Defaults to ``"exp"``. - ``"same"``: all weights are equal to 1. - - ``"exp"``: exponentially decreasing weights by a power of 2: 0, 0.5, 0.25, 0.125, etc . + - ``"exp"``: exponentially decreasing weights by a power of 2: 1, 0.5, 0.25, 0.125, etc . - ``"two"``: equal smaller weights for lower levels: 1, 0.5, 0.5, 0.5, 0.5, etc weights: a list of weights to apply to each deeply supervised sub-loss, if provided, this will be used regardless of the weight_mode diff --git a/monai/losses/focal_loss.py b/monai/losses/focal_loss.py index 98c1a071b6..28d1c0cdc9 100644 --- a/monai/losses/focal_loss.py +++ b/monai/losses/focal_loss.py @@ -234,9 +234,8 @@ def sigmoid_focal_loss( """ # computing binary cross entropy with logits # equivalent to F.binary_cross_entropy_with_logits(input, target, reduction='none') - # see also https://github.com/pytorch/pytorch/blob/v1.9.0/aten/src/ATen/native/Loss.cpp#L231 - max_val = (-input).clamp(min=0) - loss: torch.Tensor = input - input * target + max_val + ((-max_val).exp() + (-input - max_val).exp()).log() + # see also https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/Loss.cpp#L363 + loss: torch.Tensor = input - input * target - F.logsigmoid(input) # sigmoid(-i) if t==1; sigmoid(i) if t==0 <=> # 1-sigmoid(i) if t==1; sigmoid(i) if t==0 <=> diff --git a/monai/losses/nacl_loss.py b/monai/losses/nacl_loss.py new file mode 100644 index 0000000000..27a712d308 --- /dev/null +++ b/monai/losses/nacl_loss.py @@ -0,0 +1,139 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# 
class NACLLoss(_Loss):
    """
    Neighbor-Aware Calibration Loss (NACL), developed for training calibrated image segmentation models.
    It adds to the standard cross-entropy loss a penalty on the distance between the predicted logits
    and a spatially smoothed one-hot encoding of the labels (a soft class proportion of the
    surrounding pixels).

    Murugesan, Balamurali, et al.
    "Trust your neighbours: Penalty-based constraints for model calibration."
    International Conference on Medical Image Computing and Computer-Assisted Intervention, MICCAI 2023.
    https://arxiv.org/abs/2303.06268

    Murugesan, Balamurali, et al.
    "Neighbor-Aware Calibration of Segmentation Networks with Penalty-Based Constraints."
    https://arxiv.org/abs/2401.14487
    """

    def __init__(
        self,
        classes: int,
        dim: int,
        kernel_size: int = 3,
        kernel_ops: str = "mean",
        distance_type: str = "l1",
        alpha: float = 0.1,
        sigma: float = 1.0,
    ) -> None:
        """
        Args:
            classes: number of classes
            dim: dimension of data (supports 2d and 3d)
            kernel_size: size of the spatial kernel (used by the ``"mean"`` kernel)
            kernel_ops: spatial smoothing applied to the one-hot labels, ``"mean"`` or ``"gaussian"``
            distance_type: l1/l2 distance between spatial kernel and predicted logits
            alpha: weightage between cross entropy and logit constraint
            sigma: sigma of gaussian (used by the ``"gaussian"`` kernel)

        Raises:
            ValueError: when ``kernel_ops``, ``dim`` or ``distance_type`` is not one of the supported values.
        """

        super().__init__()

        if kernel_ops not in ("mean", "gaussian"):
            raise ValueError("Kernel ops must be either mean or gaussian")

        if dim not in (2, 3):
            raise ValueError(f"Support 2d and 3d, got dim={dim}.")

        if distance_type not in ("l1", "l2"):
            raise ValueError(f"Distance type must be either L1 or L2, got {distance_type}")

        self.nc = classes
        self.dim = dim
        self.cross_entropy = nn.CrossEntropyLoss()
        self.distance_type = distance_type
        self.alpha = alpha
        self.ks = kernel_size
        self.svls_layer: Any

        if kernel_ops == "mean":
            smoother = MeanFilter(spatial_dims=dim, size=kernel_size)
            # rescale the kernel by the number of elements in the window
            smoother.filter = smoother.filter / (kernel_size**dim)
            self.svls_layer = smoother
        else:
            # kernel_ops was validated above, so this branch is the gaussian kernel
            self.svls_layer = GaussianFilter(spatial_dims=dim, sigma=sigma)

    def get_constr_target(self, mask: torch.Tensor) -> torch.Tensor:
        """
        Converts the mask to a one-hot representation and smooths it with the selected spatial filter.

        Args:
            mask: the shape should be BH[WD].

        Returns:
            torch.Tensor: the shape would be BNH[WD], N being number of classes.
        """
        # one_hot appends the class axis last; move it right after the batch axis
        to_channel_first = (0, 3, 1, 2) if self.dim == 2 else (0, 4, 1, 2, 3)
        one_hot = F.one_hot(mask.to(torch.int64), num_classes=self.nc)
        one_hot = one_hot.permute(*to_channel_first).contiguous().float()

        smoothed: torch.Tensor = self.svls_layer(one_hot)
        return smoothed

    def forward(self, inputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """
        Computes the standard cross-entropy loss plus the neighbor-aware logit penalty weighted by ``alpha``.

        Args:
            inputs: the shape should be BNH[WD], where N is the number of classes.
            targets: the shape should be BH[WD].

        Returns:
            torch.Tensor: value of the loss.

        Example:
            >>> import torch
            >>> from monai.losses import NACLLoss
            >>> B, N, H, W = 8, 3, 64, 64
            >>> input = torch.rand(B, N, H, W)
            >>> target = torch.randint(0, N, (B, H, W))
            >>> criterion = NACLLoss(classes = N, dim = 2)
            >>> loss = criterion(input, target)
        """

        ce_term = self.cross_entropy(inputs, targets)

        # `sub` allocates a new tensor, so the in-place ops below never touch the caller's data
        residual = self.get_constr_target(targets).sub(inputs)

        if self.distance_type == "l1":
            penalty = residual.abs_().mean()
        elif self.distance_type == "l2":
            penalty = residual.pow_(2).abs_().mean()

        loss: torch.Tensor = ce_term + self.alpha * penalty

        return loss
perceptual loss on slices from all three axes and average. The full 3D approach uses a 3D network to calculate the perceptual loss. + MedicalNet networks are only compatible with 3D inputs and support channel-wise loss. Args: spatial_dims: number of spatial dimensions. @@ -62,6 +63,8 @@ class PerceptualLoss(nn.Module): pretrained_state_dict_key: if `pretrained_path` is not `None`, this argument is used to extract the expected state dict. This argument only works when ``"network_type"`` is "resnet50". Defaults to `None`. + channel_wise: if True, the loss is returned per channel. Otherwise the loss is averaged over the channels. + Defaults to ``False``. """ def __init__( @@ -74,6 +77,7 @@ def __init__( pretrained: bool = True, pretrained_path: str | None = None, pretrained_state_dict_key: str | None = None, + channel_wise: bool = False, ): super().__init__() @@ -86,6 +90,9 @@ def __init__( "Argument is_fake_3d must be set to False." ) + if channel_wise and "medicalnet_" not in network_type: + raise ValueError("Channel-wise loss is only compatible with MedicalNet networks.") + if network_type.lower() not in list(PercetualNetworkType): raise ValueError( "Unrecognised criterion entered for Adversarial Loss. 
Must be one in: %s" @@ -102,7 +109,9 @@ def __init__( self.spatial_dims = spatial_dims self.perceptual_function: nn.Module if spatial_dims == 3 and is_fake_3d is False: - self.perceptual_function = MedicalNetPerceptualSimilarity(net=network_type, verbose=False) + self.perceptual_function = MedicalNetPerceptualSimilarity( + net=network_type, verbose=False, channel_wise=channel_wise + ) elif "radimagenet_" in network_type: self.perceptual_function = RadImageNetPerceptualSimilarity(net=network_type, verbose=False) elif network_type == "resnet50": @@ -116,6 +125,7 @@ def __init__( self.perceptual_function = LPIPS(pretrained=pretrained, net=network_type, verbose=False) self.is_fake_3d = is_fake_3d self.fake_3d_ratio = fake_3d_ratio + self.channel_wise = channel_wise def _calculate_axis_loss(self, input: torch.Tensor, target: torch.Tensor, spatial_axis: int) -> torch.Tensor: """ @@ -172,7 +182,12 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: # 2D and real 3D cases loss = self.perceptual_function(input, target) - return torch.mean(loss) + if self.channel_wise: + loss = torch.mean(loss.squeeze(), dim=0) + else: + loss = torch.mean(loss) + + return loss class MedicalNetPerceptualSimilarity(nn.Module): @@ -185,14 +200,20 @@ class MedicalNetPerceptualSimilarity(nn.Module): net: {``"medicalnet_resnet10_23datasets"``, ``"medicalnet_resnet50_23datasets"``} Specifies the network architecture to use. Defaults to ``"medicalnet_resnet10_23datasets"``. verbose: if false, mute messages from torch Hub load function. + channel_wise: if True, the loss is returned per channel. Otherwise the loss is averaged over the channels. + Defaults to ``False``. 
""" - def __init__(self, net: str = "medicalnet_resnet10_23datasets", verbose: bool = False) -> None: + def __init__( + self, net: str = "medicalnet_resnet10_23datasets", verbose: bool = False, channel_wise: bool = False + ) -> None: super().__init__() torch.hub._validate_not_a_forked_repo = lambda a, b, c: True self.model = torch.hub.load("warvito/MedicalNet-models", model=net, verbose=verbose) self.eval() + self.channel_wise = channel_wise + for param in self.parameters(): param.requires_grad = False @@ -206,20 +227,42 @@ def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: Args: input: 3D input tensor with shape BCDHW. target: 3D target tensor with shape BCDHW. + """ input = medicalnet_intensity_normalisation(input) target = medicalnet_intensity_normalisation(target) # Get model outputs - outs_input = self.model.forward(input) - outs_target = self.model.forward(target) + feats_per_ch = 0 + for ch_idx in range(input.shape[1]): + input_channel = input[:, ch_idx, ...].unsqueeze(1) + target_channel = target[:, ch_idx, ...].unsqueeze(1) + + if ch_idx == 0: + outs_input = self.model.forward(input_channel) + outs_target = self.model.forward(target_channel) + feats_per_ch = outs_input.shape[1] + else: + outs_input = torch.cat([outs_input, self.model.forward(input_channel)], dim=1) + outs_target = torch.cat([outs_target, self.model.forward(target_channel)], dim=1) # Normalise through the channels feats_input = normalize_tensor(outs_input) feats_target = normalize_tensor(outs_target) - results: torch.Tensor = (feats_input - feats_target) ** 2 - results = spatial_average_3d(results.sum(dim=1, keepdim=True), keepdim=True) + feats_diff: torch.Tensor = (feats_input - feats_target) ** 2 + if self.channel_wise: + results = torch.zeros( + feats_diff.shape[0], input.shape[1], feats_diff.shape[2], feats_diff.shape[3], feats_diff.shape[4] + ) + for i in range(input.shape[1]): + l_idx = i * feats_per_ch + r_idx = (i + 1) * feats_per_ch + results[:, i, ...] 
= feats_diff[:, l_idx:r_idx, ...].sum(dim=1) + else: + results = feats_diff.sum(dim=1, keepdim=True) + + results = spatial_average_3d(results, keepdim=True) return results diff --git a/monai/losses/sure_loss.py b/monai/losses/sure_loss.py new file mode 100644 index 0000000000..ebf25613a6 --- /dev/null +++ b/monai/losses/sure_loss.py @@ -0,0 +1,200 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Callable, Optional + +import torch +import torch.nn as nn +from torch.nn.modules.loss import _Loss + + +def complex_diff_abs_loss(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """ + First compute the difference in the complex domain, + then get the absolute value and take the mse + + Args: + x, y - B, 2, H, W real valued tensors representing complex numbers + or B,1,H,W complex valued tensors + Returns: + l2_loss - scalar + """ + if not x.is_complex(): + x = torch.view_as_complex(x.permute(0, 2, 3, 1).contiguous()) + if not y.is_complex(): + y = torch.view_as_complex(y.permute(0, 2, 3, 1).contiguous()) + + diff = torch.abs(x - y) + return nn.functional.mse_loss(diff, torch.zeros_like(diff), reduction="mean") + + +def sure_loss_function( + operator: Callable, + x: torch.Tensor, + y_pseudo_gt: torch.Tensor, + y_ref: Optional[torch.Tensor] = None, + eps: Optional[float] = -1.0, + perturb_noise: Optional[torch.Tensor] = None, + complex_input: Optional[bool] = False, +) -> 
torch.Tensor: + """ + Args: + operator (function): The operator function that takes in an input + tensor x and returns an output tensor y. We will use this to compute + the divergence. More specifically, we will perturb the input x by a + small amount and compute the divergence between the perturbed output + and the reference output + + x (torch.Tensor): The input tensor of shape (B, C, H, W) to the + operator. For complex input, the shape is (B, 2, H, W) aka C=2 real. + For real input, the shape is (B, 1, H, W) real. + + y_pseudo_gt (torch.Tensor): The pseudo ground truth tensor of shape + (B, C, H, W) used to compute the L2 loss. For complex input, the shape is + (B, 2, H, W) aka C=2 real. For real input, the shape is (B, 1, H, W) + real. + + y_ref (torch.Tensor, optional): The reference output tensor of shape + (B, C, H, W) used to compute the divergence. Defaults to None. For + complex input, the shape is (B, 2, H, W) aka C=2 real. For real input, + the shape is (B, 1, H, W) real. + + eps (float, optional): The perturbation scalar. Set to -1 to have it + estimated automatically from y_pseudo_gt. + + perturb_noise (torch.Tensor, optional): The noise vector of shape (B, C, H, W). + Defaults to None. For complex input, the shape is (B, 2, H, W) aka C=2 real. + For real input, the shape is (B, 1, H, W) real. + + complex_input(bool, optional): Whether the input is complex or not. + Defaults to False. + + Returns: + sure_loss (torch.Tensor): The SURE loss scalar. 
+ """ + # perturb input + if perturb_noise is None: + perturb_noise = torch.randn_like(x) + if eps is None or eps == -1.0: + eps = float(torch.abs(y_pseudo_gt.max())) / 1000 + # get y_ref if not provided + if y_ref is None: + y_ref = operator(x) + + # get perturbed output + x_perturbed = x + eps * perturb_noise + y_perturbed = operator(x_perturbed) + # divergence + divergence = torch.sum(1.0 / eps * torch.matmul(perturb_noise.permute(0, 1, 3, 2), y_perturbed - y_ref)) # type: ignore + # l2 loss between y_ref, y_pseudo_gt + if complex_input: + l2_loss = complex_diff_abs_loss(y_ref, y_pseudo_gt) + else: + # real input + l2_loss = nn.functional.mse_loss(y_ref, y_pseudo_gt, reduction="mean") + + # sure loss + sure_loss = l2_loss * divergence / (x.shape[0] * x.shape[2] * x.shape[3]) + return sure_loss + + +class SURELoss(_Loss): + """ + Calculate the Stein's Unbiased Risk Estimator (SURE) loss for a given operator. + + This is a differentiable loss function that can be used to train/guide an + operator (e.g. neural network), where the pseudo ground truth is available + but the reference ground truth is not. For example, in the MRI + reconstruction, the pseudo ground truth is the zero-filled reconstruction + and the reference ground truth is the fully sampled reconstruction. Often, + the reference ground truth is not available due to the lack of fully sampled + data. + + The original SURE loss is proposed in [1]. The SURE loss used for guiding + the diffusion model based MRI reconstruction is proposed in [2]. + + Reference + + [1] Stein, C.M.: Estimation of the mean of a multivariate normal distribution. Annals of Statistics + + [2] B. Ozturkler et al. SMRD: SURE-based Robust MRI Reconstruction with Diffusion Models. + (https://arxiv.org/pdf/2310.01799.pdf) + """ + + def __init__(self, perturb_noise: Optional[torch.Tensor] = None, eps: Optional[float] = None) -> None: + """ + Args: + perturb_noise (torch.Tensor, optional): The noise vector of shape + (B, C, H, W). Defaults to None. 
For complex input, the shape is (B, 2, H, W) aka C=2 real. + For real input, the shape is (B, 1, H, W) real. + + eps (float, optional): The perturbation scalar. Defaults to None. + """ + super().__init__() + self.perturb_noise = perturb_noise + self.eps = eps + + def forward( + self, + operator: Callable, + x: torch.Tensor, + y_pseudo_gt: torch.Tensor, + y_ref: Optional[torch.Tensor] = None, + complex_input: Optional[bool] = False, + ) -> torch.Tensor: + """ + Args: + operator (function): The operator function that takes in an input + tensor x and returns an output tensor y. We will use this to compute + the divergence. More specifically, we will perturb the input x by a + small amount and compute the divergence between the perturbed output + and the reference output + + x (torch.Tensor): The input tensor of shape (B, C, H, W) to the + operator. C=1 or 2: For complex input, the shape is (B, 2, H, W) aka + C=2 real. For real input, the shape is (B, 1, H, W) real. + + y_pseudo_gt (torch.Tensor): The pseudo ground truth tensor of shape + (B, C, H, W) used to compute the L2 loss. C=1 or 2: For complex + input, the shape is (B, 2, H, W) aka C=2 real. For real input, the + shape is (B, 1, H, W) real. + + y_ref (torch.Tensor, optional): The reference output tensor of the + same shape as y_pseudo_gt + + Returns: + sure_loss (torch.Tensor): The SURE loss scalar. + """ + + # check inputs shapes + if x.dim() != 4: + raise ValueError(f"Input tensor x should be 4D, got {x.dim()}.") + if y_pseudo_gt.dim() != 4: + raise ValueError(f"Input tensor y_pseudo_gt should be 4D, but got {y_pseudo_gt.dim()}.") + if y_ref is not None and y_ref.dim() != 4: + raise ValueError(f"Input tensor y_ref should be 4D, but got {y_ref.dim()}.") + if x.shape != y_pseudo_gt.shape: + raise ValueError( + f"Input tensor x and y_pseudo_gt should have the same shape, but got x shape {x.shape}, " + f"y_pseudo_gt shape {y_pseudo_gt.shape}." 
+ ) + if y_ref is not None and y_pseudo_gt.shape != y_ref.shape: + raise ValueError( + f"Input tensor y_pseudo_gt and y_ref should have the same shape, but got y_pseudo_gt shape {y_pseudo_gt.shape}, " + f"y_ref shape {y_ref.shape}." + ) + + # compute loss + loss = sure_loss_function(operator, x, y_pseudo_gt, y_ref, self.eps, self.perturb_noise, complex_input) + + return loss diff --git a/monai/metrics/cumulative_average.py b/monai/metrics/cumulative_average.py index e55e7b8576..dccf7b094b 100644 --- a/monai/metrics/cumulative_average.py +++ b/monai/metrics/cumulative_average.py @@ -65,6 +65,7 @@ def get_current(self, to_numpy: bool = True) -> NdarrayOrTensor: if self.val is None: return 0 + val: NdarrayOrTensor val = self.val.clone() val[~torch.isfinite(val)] = 0 @@ -96,6 +97,7 @@ def aggregate(self, to_numpy: bool = True) -> NdarrayOrTensor: dist.all_reduce(sum) dist.all_reduce(count) + val: NdarrayOrTensor val = torch.where(count > 0, sum / count, sum) if to_numpy: diff --git a/monai/metrics/metric.py b/monai/metrics/metric.py index a6dc1a49a2..249b2dc951 100644 --- a/monai/metrics/metric.py +++ b/monai/metrics/metric.py @@ -37,6 +37,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: """ raise NotImplementedError(f"Subclass {self.__class__.__name__} must implement this method.") + def __str__(self): + return self.__class__.__name__ + class IterationMetric(Metric): """ diff --git a/monai/metrics/panoptic_quality.py b/monai/metrics/panoptic_quality.py index 05175ba0fb..7c9d59c264 100644 --- a/monai/metrics/panoptic_quality.py +++ b/monai/metrics/panoptic_quality.py @@ -274,7 +274,7 @@ def _get_paired_iou( return paired_iou, paired_true, paired_pred - pairwise_iou = pairwise_iou.cpu().numpy() + pairwise_iou = pairwise_iou.cpu().numpy() # type: ignore[assignment] paired_true, paired_pred = linear_sum_assignment(-pairwise_iou) paired_iou = pairwise_iou[paired_true, paired_pred] paired_true = torch.as_tensor(list(paired_true[paired_iou > match_iou_threshold] + 
1), device=device) diff --git a/monai/metrics/regression.py b/monai/metrics/regression.py index 9d29654ee3..4c8b8aa71b 100644 --- a/monai/metrics/regression.py +++ b/monai/metrics/regression.py @@ -303,7 +303,7 @@ def _compute_metric(self, y_pred: torch.Tensor, y: torch.Tensor) -> torch.Tensor if self.spatial_dims == 3 and dims != 5: raise ValueError( - f"y_pred should have 4 dimensions (batch, channel, height, width, depth) when using {self.spatial_dims}" + f"y_pred should have 5 dimensions (batch, channel, height, width, depth) when using {self.spatial_dims}" f" spatial dimensions, got {dims}." ) diff --git a/monai/metrics/rocauc.py b/monai/metrics/rocauc.py index 56d9faa9dd..57a8a072b4 100644 --- a/monai/metrics/rocauc.py +++ b/monai/metrics/rocauc.py @@ -88,8 +88,8 @@ def _calculate(y_pred: torch.Tensor, y: torch.Tensor) -> float: n = len(y) indices = y_pred.argsort() - y = y[indices].cpu().numpy() - y_pred = y_pred[indices].cpu().numpy() + y = y[indices].cpu().numpy() # type: ignore[assignment] + y_pred = y_pred[indices].cpu().numpy() # type: ignore[assignment] nneg = auc = tmp_pos = tmp_neg = 0.0 for i in range(n): diff --git a/monai/metrics/utils.py b/monai/metrics/utils.py index e7057256fb..340e54a1d7 100644 --- a/monai/metrics/utils.py +++ b/monai/metrics/utils.py @@ -35,9 +35,9 @@ optional_import, ) -binary_erosion, _ = optional_import("scipy.ndimage.morphology", name="binary_erosion") -distance_transform_edt, _ = optional_import("scipy.ndimage.morphology", name="distance_transform_edt") -distance_transform_cdt, _ = optional_import("scipy.ndimage.morphology", name="distance_transform_cdt") +binary_erosion, _ = optional_import("scipy.ndimage", name="binary_erosion") +distance_transform_edt, _ = optional_import("scipy.ndimage", name="distance_transform_edt") +distance_transform_cdt, _ = optional_import("scipy.ndimage", name="distance_transform_cdt") __all__ = [ "ignore_background", diff --git a/monai/networks/blocks/__init__.py 
b/monai/networks/blocks/__init__.py index e67cb3376f..47abc4a1c4 100644 --- a/monai/networks/blocks/__init__.py +++ b/monai/networks/blocks/__init__.py @@ -17,6 +17,7 @@ from .backbone_fpn_utils import BackboneWithFPN from .convolutions import Convolution, ResidualUnit from .crf import CRF +from .crossattention import CrossAttentionBlock from .denseblock import ConvDenseBlock, DenseBlock from .dints_block import ActiConvNormBlock, FactorizedIncreaseBlock, FactorizedReduceBlock, P3DActiConvNormBlock from .downsample import MaxAvgPool @@ -30,6 +31,8 @@ from .regunet_block import RegistrationDownSampleBlock, RegistrationExtractionBlock, RegistrationResidualConvBlock from .segresnet_block import ResBlock from .selfattention import SABlock +from .spade_norm import SPADE +from .spatialattention import SpatialAttentionBlock from .squeeze_and_excitation import ( ChannelSELayer, ResidualSELayer, diff --git a/monai/networks/blocks/attention_utils.py b/monai/networks/blocks/attention_utils.py new file mode 100644 index 0000000000..8c9002a16e --- /dev/null +++ b/monai/networks/blocks/attention_utils.py @@ -0,0 +1,128 @@ +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Tuple + +import torch +import torch.nn.functional as F +from torch import nn + + +def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + + Args: + q_size (int): size of query q. 
+ k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + + Returns: + Extracted positional embeddings according to relative positions. + """ + rel_pos_resized: torch.Tensor = torch.Tensor() + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), size=max_rel_dist, mode="linear" + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_rel_pos( + attn: torch.Tensor, q: torch.Tensor, rel_pos_lst: nn.ParameterList, q_size: Tuple, k_size: Tuple +) -> torch.Tensor: + r""" + Calculate decomposed Relative Positional Embeddings from mvitv2 implementation: + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py + + Only 2D and 3D are supported. + + Encoding the relative position of tokens in the attention matrix: tokens spaced a distance + `d` apart will have the same embedding value (unlike absolute positional embedding). + + .. math:: + Attn_{logits}(Q, K) = (QK^{T} + E_{rel})*scale + + where + + .. math:: + E_{ij}^{(rel)} = Q_{i}.R_{p(i), p(j)} + + with :math:`R_{p(i), p(j)} \in R^{dim}` and :math:`p(i), p(j)`, + respectively spatial positions of element :math:`i` and :math:`j` + + When using "decomposed" relative positional embedding, positional embedding is defined ("decomposed") as follow: + + .. math:: + R_{p(i), p(j)} = R^{d1}_{d1(i), d1(j)} + ... 
+ R^{dn}_{dn(i), dn(j)} + + with :math:`n = 1...dim` + + Decomposed relative positional embedding reduces the complexity from :math:`\mathcal{O}(d1*...*dn)` to + :math:`\mathcal{O}(d1+...+dn)` compared with classical relative positional embedding. + + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, s_dim_1 * ... * s_dim_n, C). + rel_pos_lst (ParameterList): relative position embeddings for each axis: rel_pos_lst[n] for nth axis. + q_size (Tuple): spatial sequence size of query q with (q_dim_1, ..., q_dim_n). + k_size (Tuple): spatial sequence size of key k with (k_dim_1, ..., k_dim_n). + + Returns: + attn (Tensor): attention logits with added relative positional embeddings. + """ + rh = get_rel_pos(q_size[0], k_size[0], rel_pos_lst[0]) + rw = get_rel_pos(q_size[1], k_size[1], rel_pos_lst[1]) + + batch, _, dim = q.shape + + if len(rel_pos_lst) == 2: + q_h, q_w = q_size[:2] + k_h, k_w = k_size[:2] + r_q = q.reshape(batch, q_h, q_w, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, rh) + rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, rw) + + attn = (attn.view(batch, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).view( + batch, q_h * q_w, k_h * k_w + ) + elif len(rel_pos_lst) == 3: + q_h, q_w, q_d = q_size[:3] + k_h, k_w, k_d = k_size[:3] + + rd = get_rel_pos(q_d, k_d, rel_pos_lst[2]) + + r_q = q.reshape(batch, q_h, q_w, q_d, dim) + rel_h = torch.einsum("bhwdc,hkc->bhwdk", r_q, rh) + rel_w = torch.einsum("bhwdc,wkc->bhwdk", r_q, rw) + rel_d = torch.einsum("bhwdc,dkc->bhwdk", r_q, rd) + + attn = ( + attn.view(batch, q_h, q_w, q_d, k_h, k_w, k_d) + + rel_h[:, :, :, :, :, None, None] + + rel_w[:, :, :, :, None, :, None] + + rel_d[:, :, :, :, None, None, :] + ).view(batch, q_h * q_w * q_d, k_h * k_w * k_d) + + return attn diff --git a/monai/networks/blocks/crossattention.py b/monai/networks/blocks/crossattention.py new file mode 100644 index 0000000000..bdecf63168 --- /dev/null +++ 
b/monai/networks/blocks/crossattention.py @@ -0,0 +1,190 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Optional, Tuple + +import torch +import torch.nn as nn + +from monai.networks.layers.utils import get_rel_pos_embedding_layer +from monai.utils import optional_import, pytorch_after + +Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange") + + +class CrossAttentionBlock(nn.Module): + """ + A cross-attention block, based on: "Dosovitskiy et al., + An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale " + One can setup relative positional embedding as described in https://arxiv.org/abs/2112.01526 + """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + dropout_rate: float = 0.0, + hidden_input_size: int | None = None, + context_input_size: int | None = None, + dim_head: int | None = None, + qkv_bias: bool = False, + save_attn: bool = False, + causal: bool = False, + sequence_length: int | None = None, + rel_pos_embedding: Optional[str] = None, + input_size: Optional[Tuple] = None, + attention_dtype: Optional[torch.dtype] = None, + use_flash_attention: bool = False, + ) -> None: + """ + Args: + hidden_size (int): dimension of hidden layer. + num_heads (int): number of attention heads. + dropout_rate (float, optional): fraction of the input units to drop. Defaults to 0.0. + hidden_input_size (int, optional): dimension of the input tensor. Defaults to hidden_size. 
+ context_input_size (int, optional): dimension of the context tensor. Defaults to hidden_size. + dim_head (int, optional): dimension of each head. Defaults to hidden_size // num_heads. + qkv_bias (bool, optional): bias term for the qkv linear layer. Defaults to False. + save_attn (bool, optional): to make accessible the attention matrix. Defaults to False. + causal (bool, optional): whether to use causal attention. + sequence_length (int, optional): if causal is True, it is necessary to specify the sequence length. + rel_pos_embedding (str, optional): Add relative positional embeddings to the attention map. For now only + "decomposed" is supported (see https://arxiv.org/abs/2112.01526). 2D and 3D are supported. + input_size (tuple(spatial_dim), optional): Input resolution for calculating the relative positional + parameter size. + attention_dtype: cast attention operations to this dtype. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). + """ + + super().__init__() + + if not (0 <= dropout_rate <= 1): + raise ValueError("dropout_rate should be between 0 and 1.") + + if dim_head: + inner_size = num_heads * dim_head + self.head_dim = dim_head + else: + if hidden_size % num_heads != 0: + raise ValueError("hidden size should be divisible by num_heads.") + inner_size = hidden_size + self.head_dim = hidden_size // num_heads + + if causal and sequence_length is None: + raise ValueError("sequence_length is necessary for causal attention.") + + if use_flash_attention and not pytorch_after(minor=13, major=1, patch=0): + raise ValueError( + "use_flash_attention is only supported for PyTorch versions >= 2.0." + "Upgrade your PyTorch or set the flag to False." + ) + if use_flash_attention and save_attn: + raise ValueError( + "save_attn has been set to True, but use_flash_attention is also set" + "to True. 
save_attn can only be used if use_flash_attention is False" + ) + + if use_flash_attention and rel_pos_embedding is not None: + raise ValueError("rel_pos_embedding must be None if you are using flash_attention.") + + self.num_heads = num_heads + self.hidden_input_size = hidden_input_size if hidden_input_size else hidden_size + self.context_input_size = context_input_size if context_input_size else hidden_size + self.out_proj = nn.Linear(inner_size, self.hidden_input_size) + # key, query, value projections + self.to_q = nn.Linear(self.hidden_input_size, inner_size, bias=qkv_bias) + self.to_k = nn.Linear(self.context_input_size, inner_size, bias=qkv_bias) + self.to_v = nn.Linear(self.context_input_size, inner_size, bias=qkv_bias) + self.input_rearrange = Rearrange("b h (l d) -> b l h d", l=num_heads) + + self.out_rearrange = Rearrange("b l h d -> b h (l d)") + self.drop_output = nn.Dropout(dropout_rate) + self.drop_weights = nn.Dropout(dropout_rate) + self.dropout_rate = dropout_rate + + self.scale = self.head_dim**-0.5 + self.save_attn = save_attn + self.attention_dtype = attention_dtype + + self.causal = causal + self.sequence_length = sequence_length + self.use_flash_attention = use_flash_attention + + if causal and sequence_length is not None: + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer( + "causal_mask", + torch.tril(torch.ones(sequence_length, sequence_length)).view(1, 1, sequence_length, sequence_length), + ) + self.causal_mask: torch.Tensor + else: + self.causal_mask = torch.Tensor() + + self.att_mat = torch.Tensor() + self.rel_positional_embedding = ( + get_rel_pos_embedding_layer(rel_pos_embedding, input_size, self.head_dim, self.num_heads) + if rel_pos_embedding is not None + else None + ) + self.input_size = input_size + + def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None): + """ + Args: + x (torch.Tensor): input tensor. B x (s_dim_1 * ... 
* s_dim_n) x C + context (torch.Tensor, optional): context tensor. B x (s_dim_1 * ... * s_dim_n) x C + + Return: + torch.Tensor: B x (s_dim_1 * ... * s_dim_n) x C + """ + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + b, t, c = x.size() # batch size, sequence length, embedding dimensionality (hidden_size) + + q = self.input_rearrange(self.to_q(x)) + kv = context if context is not None else x + _, kv_t, _ = kv.size() + k = self.input_rearrange(self.to_k(kv)) + v = self.input_rearrange(self.to_v(kv)) + + if self.attention_dtype is not None: + q = q.to(self.attention_dtype) + k = k.to(self.attention_dtype) + + if self.use_flash_attention: + x = torch.nn.functional.scaled_dot_product_attention( + query=q, key=k, value=v, scale=self.scale, dropout_p=self.dropout_rate, is_causal=self.causal + ) + else: + att_mat = torch.einsum("blxd,blyd->blxy", q, k) * self.scale + # apply relative positional embedding if defined + if self.rel_positional_embedding is not None: + att_mat = self.rel_positional_embedding(x, att_mat, q) + + if self.causal: + att_mat = att_mat.masked_fill(self.causal_mask[:, :, :t, :kv_t] == 0, float("-inf")) + + att_mat = att_mat.softmax(dim=-1) + + if self.save_attn: + # no gradients and new tensor; + # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html + self.att_mat = att_mat.detach() + + att_mat = self.drop_weights(att_mat) + x = torch.einsum("bhxy,bhyd->bhxd", att_mat, v) + + x = self.out_rearrange(x) + x = self.out_proj(x) + x = self.drop_output(x) + return x diff --git a/monai/networks/blocks/mlp.py b/monai/networks/blocks/mlp.py index d3510b64d3..8771711d25 100644 --- a/monai/networks/blocks/mlp.py +++ b/monai/networks/blocks/mlp.py @@ -11,12 +11,15 @@ from __future__ import annotations +from typing import Union + import torch.nn as nn from monai.networks.layers import get_act_layer +from monai.networks.layers.factories import split_args from monai.utils import look_up_option 
-SUPPORTED_DROPOUT_MODE = {"vit", "swin"} +SUPPORTED_DROPOUT_MODE = {"vit", "swin", "vista3d"} class MLPBlock(nn.Module): @@ -39,7 +42,7 @@ def __init__( https://github.com/google-research/vision_transformer/blob/main/vit_jax/models.py#L87 "swin" corresponds to one instance as implemented in https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_mlp.py#L23 - + "vista3d" mode does not use dropout. """ @@ -48,15 +51,24 @@ def __init__( if not (0 <= dropout_rate <= 1): raise ValueError("dropout_rate should be between 0 and 1.") mlp_dim = mlp_dim or hidden_size - self.linear1 = nn.Linear(hidden_size, mlp_dim) if act != "GEGLU" else nn.Linear(hidden_size, mlp_dim * 2) + act_name, _ = split_args(act) + self.linear1 = nn.Linear(hidden_size, mlp_dim) if act_name != "GEGLU" else nn.Linear(hidden_size, mlp_dim * 2) self.linear2 = nn.Linear(mlp_dim, hidden_size) self.fn = get_act_layer(act) - self.drop1 = nn.Dropout(dropout_rate) + # Use Union[nn.Dropout, nn.Identity] for type annotations + self.drop1: Union[nn.Dropout, nn.Identity] + self.drop2: Union[nn.Dropout, nn.Identity] + dropout_opt = look_up_option(dropout_mode, SUPPORTED_DROPOUT_MODE) if dropout_opt == "vit": + self.drop1 = nn.Dropout(dropout_rate) self.drop2 = nn.Dropout(dropout_rate) elif dropout_opt == "swin": + self.drop1 = nn.Dropout(dropout_rate) self.drop2 = self.drop1 + elif dropout_opt == "vista3d": + self.drop1 = nn.Identity() + self.drop2 = nn.Identity() else: raise ValueError(f"dropout_mode should be one of {SUPPORTED_DROPOUT_MODE}") diff --git a/monai/networks/blocks/patchembedding.py b/monai/networks/blocks/patchembedding.py index 7d56045814..91bd73ebbb 100644 --- a/monai/networks/blocks/patchembedding.py +++ b/monai/networks/blocks/patchembedding.py @@ -120,9 +120,7 @@ def __init__( for in_size, pa_size in zip(img_size, patch_size): grid_size.append(in_size // pa_size) - with torch.no_grad(): - pos_embeddings = build_sincos_position_embedding(grid_size, hidden_size, spatial_dims) - 
self.position_embeddings.data.copy_(pos_embeddings.float()) + self.position_embeddings = build_sincos_position_embedding(grid_size, hidden_size, spatial_dims) else: raise ValueError(f"pos_embed_type {self.pos_embed_type} not supported.") diff --git a/monai/networks/blocks/pos_embed_utils.py b/monai/networks/blocks/pos_embed_utils.py index e03553307e..21586e56da 100644 --- a/monai/networks/blocks/pos_embed_utils.py +++ b/monai/networks/blocks/pos_embed_utils.py @@ -46,7 +46,7 @@ def build_sincos_position_embedding( temperature (float): The temperature for the sin-cos position embedding. Returns: - pos_embed (nn.Parameter): The sin-cos position embedding as a learnable parameter. + pos_embed (nn.Parameter): The sin-cos position embedding as a fixed parameter. """ if spatial_dims == 2: diff --git a/monai/networks/blocks/rel_pos_embedding.py b/monai/networks/blocks/rel_pos_embedding.py new file mode 100644 index 0000000000..e53e5841b0 --- /dev/null +++ b/monai/networks/blocks/rel_pos_embedding.py @@ -0,0 +1,56 @@ +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Iterable, Tuple + +import torch +from torch import nn + +from monai.networks.blocks.attention_utils import add_decomposed_rel_pos +from monai.utils.misc import ensure_tuple_size + + +class DecomposedRelativePosEmbedding(nn.Module): + def __init__(self, s_input_dims: Tuple[int, int] | Tuple[int, int, int], c_dim: int, num_heads: int) -> None: + """ + Args: + s_input_dims (Tuple): input spatial dimension. 
(H, W) or (H, W, D) + c_dim (int): channel dimension + num_heads(int): number of attention heads + """ + super().__init__() + + # validate inputs + if not isinstance(s_input_dims, Iterable) or len(s_input_dims) not in [2, 3]: + raise ValueError("s_input_dims must be set as follows: (H, W) or (H, W, D)") + + self.s_input_dims = s_input_dims + self.c_dim = c_dim + self.num_heads = num_heads + self.rel_pos_arr = nn.ParameterList( + [nn.Parameter(torch.zeros(2 * dim_input_size - 1, c_dim)) for dim_input_size in s_input_dims] + ) + + def forward(self, x: torch.Tensor, att_mat: torch.Tensor, q: torch.Tensor) -> torch.Tensor: + """""" + batch = x.shape[0] + h, w, d = ensure_tuple_size(self.s_input_dims, 3, 1) + + att_mat = add_decomposed_rel_pos( + att_mat.contiguous().view(batch * self.num_heads, h * w * d, h * w * d), + q.contiguous().view(batch * self.num_heads, h * w * d, -1), + self.rel_pos_arr, + (h, w) if d == 1 else (h, w, d), + (h, w) if d == 1 else (h, w, d), + ) + + att_mat = att_mat.reshape(batch, self.num_heads, h * w * d, h * w * d) + return att_mat diff --git a/monai/networks/blocks/selfattention.py b/monai/networks/blocks/selfattention.py index 7c81c1704f..ac96b077bd 100644 --- a/monai/networks/blocks/selfattention.py +++ b/monai/networks/blocks/selfattention.py @@ -11,10 +11,14 @@ from __future__ import annotations +from typing import Tuple, Union + import torch import torch.nn as nn +import torch.nn.functional as F -from monai.utils import optional_import +from monai.networks.layers.utils import get_rel_pos_embedding_layer +from monai.utils import optional_import, pytorch_after Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange") @@ -32,6 +36,16 @@ def __init__( dropout_rate: float = 0.0, qkv_bias: bool = False, save_attn: bool = False, + dim_head: int | None = None, + hidden_input_size: int | None = None, + causal: bool = False, + sequence_length: int | None = None, + rel_pos_embedding: str | None = None, + input_size: Tuple | None = 
None, + attention_dtype: torch.dtype | None = None, + include_fc: bool = True, + use_combined_linear: bool = True, + use_flash_attention: bool = False, ) -> None: """ Args: @@ -40,6 +54,19 @@ def __init__( dropout_rate (float, optional): fraction of the input units to drop. Defaults to 0.0. qkv_bias (bool, optional): bias term for the qkv linear layer. Defaults to False. save_attn (bool, optional): to make accessible the attention matrix. Defaults to False. + dim_head (int, optional): dimension of each head. Defaults to hidden_size // num_heads. + hidden_input_size (int, optional): dimension of the input tensor. Defaults to hidden_size. + causal: whether to use causal attention (see https://arxiv.org/abs/1706.03762). + sequence_length: if causal is True, it is necessary to specify the sequence length. + rel_pos_embedding (str, optional): Add relative positional embeddings to the attention map. + For now only "decomposed" is supported (see https://arxiv.org/abs/2112.01526). 2D and 3D are supported. + input_size (tuple(spatial_dim), optional): Input resolution for calculating the relative + positional parameter size. + attention_dtype: cast attention operations to this dtype. + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to True. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). 
""" @@ -51,30 +78,128 @@ def __init__( if hidden_size % num_heads != 0: raise ValueError("hidden size should be divisible by num_heads.") + if dim_head: + self.inner_dim = num_heads * dim_head + self.dim_head = dim_head + else: + if hidden_size % num_heads != 0: + raise ValueError("hidden size should be divisible by num_heads.") + self.inner_dim = hidden_size + self.dim_head = hidden_size // num_heads + + if causal and sequence_length is None: + raise ValueError("sequence_length is necessary for causal attention.") + + if use_flash_attention and not pytorch_after(major=2, minor=0): # F.scaled_dot_product_attention requires PyTorch >= 2.0 + raise ValueError( + "use_flash_attention is only supported for PyTorch versions >= 2.0. " + "Upgrade your PyTorch or set the flag to False." + ) + if use_flash_attention and save_attn: + raise ValueError( + "save_attn has been set to True, but use_flash_attention is also set " + "to True. save_attn can only be used if use_flash_attention is False." + ) + + if use_flash_attention and rel_pos_embedding is not None: + raise ValueError("rel_pos_embedding must be None if you are using flash_attention.") + self.num_heads = num_heads - self.out_proj = nn.Linear(hidden_size, hidden_size) - self.qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias) - self.input_rearrange = Rearrange("b h (qkv l d) -> qkv b l h d", qkv=3, l=num_heads) - self.out_rearrange = Rearrange("b h l d -> b l (h d)") + self.hidden_input_size = hidden_input_size if hidden_input_size else hidden_size + self.out_proj = nn.Linear(self.inner_dim, self.hidden_input_size) + + self.qkv: Union[nn.Linear, nn.Identity] + self.to_q: Union[nn.Linear, nn.Identity] + self.to_k: Union[nn.Linear, nn.Identity] + self.to_v: Union[nn.Linear, nn.Identity] + + if use_combined_linear: + self.qkv = nn.Linear(self.hidden_input_size, self.inner_dim * 3, bias=qkv_bias) + self.to_q = self.to_k = self.to_v = nn.Identity() # add to enable torchscript + self.input_rearrange = Rearrange("b h (qkv l d) -> qkv b l h d", qkv=3, l=num_heads) 
+ else: + self.to_q = nn.Linear(self.hidden_input_size, self.inner_dim, bias=qkv_bias) + self.to_k = nn.Linear(self.hidden_input_size, self.inner_dim, bias=qkv_bias) + self.to_v = nn.Linear(self.hidden_input_size, self.inner_dim, bias=qkv_bias) + self.qkv = nn.Identity() # add to enable torchscript + self.input_rearrange = Rearrange("b h (l d) -> b l h d", l=num_heads) + self.out_rearrange = Rearrange("b l h d -> b h (l d)") self.drop_output = nn.Dropout(dropout_rate) self.drop_weights = nn.Dropout(dropout_rate) - self.head_dim = hidden_size // num_heads - self.scale = self.head_dim**-0.5 + self.dropout_rate = dropout_rate + self.scale = self.dim_head**-0.5 self.save_attn = save_attn self.att_mat = torch.Tensor() + self.attention_dtype = attention_dtype + self.causal = causal + self.sequence_length = sequence_length + self.include_fc = include_fc + self.use_combined_linear = use_combined_linear + self.use_flash_attention = use_flash_attention + + if causal and sequence_length is not None: + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer( + "causal_mask", + torch.tril(torch.ones(sequence_length, sequence_length)).view(1, 1, sequence_length, sequence_length), + ) + self.causal_mask: torch.Tensor + else: + self.causal_mask = torch.Tensor() + + self.rel_positional_embedding = ( + get_rel_pos_embedding_layer(rel_pos_embedding, input_size, self.dim_head, self.num_heads) + if rel_pos_embedding is not None + else None + ) + self.input_size = input_size def forward(self, x): - output = self.input_rearrange(self.qkv(x)) - q, k, v = output[0], output[1], output[2] - att_mat = (torch.einsum("blxd,blyd->blxy", q, k) * self.scale).softmax(dim=-1) - if self.save_attn: - # no gradients and new tensor; - # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html - self.att_mat = att_mat.detach() - - att_mat = self.drop_weights(att_mat) - x = torch.einsum("bhxy,bhyd->bhxd", att_mat, v) + """ + Args: + x 
(torch.Tensor): input tensor. B x (s_dim_1 * ... * s_dim_n) x C + + Return: + torch.Tensor: B x (s_dim_1 * ... * s_dim_n) x C + """ + if self.use_combined_linear: + output = self.input_rearrange(self.qkv(x)) + q, k, v = output[0], output[1], output[2] + else: + q = self.input_rearrange(self.to_q(x)) + k = self.input_rearrange(self.to_k(x)) + v = self.input_rearrange(self.to_v(x)) + + if self.attention_dtype is not None: + q = q.to(self.attention_dtype) + k = k.to(self.attention_dtype) + + if self.use_flash_attention: + x = F.scaled_dot_product_attention( + query=q, key=k, value=v, scale=self.scale, dropout_p=self.dropout_rate, is_causal=self.causal + ) + else: + att_mat = torch.einsum("blxd,blyd->blxy", q, k) * self.scale + + # apply relative positional embedding if defined + if self.rel_positional_embedding is not None: + att_mat = self.rel_positional_embedding(x, att_mat, q) + + if self.causal: + att_mat = att_mat.masked_fill(self.causal_mask[:, :, : x.shape[-2], : x.shape[-2]] == 0, float("-inf")) + + att_mat = att_mat.softmax(dim=-1) + + if self.save_attn: + # no gradients and new tensor; + # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html + self.att_mat = att_mat.detach() + + att_mat = self.drop_weights(att_mat) + x = torch.einsum("bhxy,bhyd->bhxd", att_mat, v) + x = self.out_rearrange(x) - x = self.out_proj(x) + if self.include_fc: + x = self.out_proj(x) x = self.drop_output(x) return x diff --git a/monai/networks/blocks/spade_norm.py b/monai/networks/blocks/spade_norm.py new file mode 100644 index 0000000000..343dfa9ec0 --- /dev/null +++ b/monai/networks/blocks/spade_norm.py @@ -0,0 +1,95 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from monai.networks.blocks import Convolution +from monai.networks.layers.utils import get_norm_layer + + +class SPADE(nn.Module): + """ + Spatially Adaptive Normalization (SPADE) block, allowing for normalization of activations conditioned on a + semantic map. This block is used in SPADE-based image-to-image translation models, as described in + Semantic Image Synthesis with Spatially-Adaptive Normalization (https://arxiv.org/abs/1903.07291). + + Args: + label_nc: number of semantic labels + norm_nc: number of output channels + kernel_size: kernel size + spatial_dims: number of spatial dimensions + hidden_channels: number of channels in the intermediate gamma and beta layers + norm: type of base normalisation used before applying the SPADE normalisation + norm_params: parameters for the base normalisation + """ + + def __init__( + self, + label_nc: int, + norm_nc: int, + kernel_size: int = 3, + spatial_dims: int = 2, + hidden_channels: int = 64, + norm: str | tuple = "INSTANCE", + norm_params: dict | None = None, + ) -> None: + super().__init__() + + if norm_params is None: + norm_params = {} + if len(norm_params) != 0: + norm = (norm, norm_params) + self.param_free_norm = get_norm_layer(norm, spatial_dims=spatial_dims, channels=norm_nc) + self.mlp_shared = Convolution( + spatial_dims=spatial_dims, + in_channels=label_nc, + out_channels=hidden_channels, + kernel_size=kernel_size, + norm=None, + act="LEAKYRELU", + ) + self.mlp_gamma = Convolution( + 
spatial_dims=spatial_dims, + in_channels=hidden_channels, + out_channels=norm_nc, + kernel_size=kernel_size, + act=None, + ) + self.mlp_beta = Convolution( + spatial_dims=spatial_dims, + in_channels=hidden_channels, + out_channels=norm_nc, + kernel_size=kernel_size, + act=None, + ) + + def forward(self, x: torch.Tensor, segmap: torch.Tensor) -> torch.Tensor: + """ + Args: + x: input tensor with shape (B, C, [spatial-dimensions]) where C is the number of semantic channels. + segmap: input segmentation map (B, C, [spatial-dimensions]) where C is the number of semantic channels. + The map will be interpolated to the dimension of x internally. + """ + + # Part 1. generate parameter-free normalized activations + normalized = self.param_free_norm(x.contiguous()) + + # Part 2. produce scaling and bias conditioned on semantic map + segmap = F.interpolate(segmap, size=x.size()[2:], mode="nearest") + actv = self.mlp_shared(segmap) + gamma = self.mlp_gamma(actv) + beta = self.mlp_beta(actv) + out: torch.Tensor = normalized * (1 + gamma) + beta + return out diff --git a/monai/networks/blocks/spatialattention.py b/monai/networks/blocks/spatialattention.py new file mode 100644 index 0000000000..665442b55e --- /dev/null +++ b/monai/networks/blocks/spatialattention.py @@ -0,0 +1,97 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import Optional + +import torch +import torch.nn as nn + +from monai.networks.blocks import SABlock +from monai.utils import optional_import + +Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange") + + +class SpatialAttentionBlock(nn.Module): + """Perform spatial self-attention on the input tensor. + + The input tensor is reshaped to B x (x_dim * y_dim [ * z_dim]) x C, where C is the number of channels, and then + self-attention is performed on the reshaped tensor. The output tensor is reshaped back to the original shape. + + Args: + spatial_dims: number of spatial dimensions, could be 1, 2, or 3. + num_channels: number of input channels. Must be divisible by num_head_channels. + num_head_channels: number of channels per head. + norm_num_groups: Number of groups for the group norm layer. + norm_eps: Epsilon for the normalization. + attention_dtype: cast attention operations to this dtype. + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). 
+ + """ + + def __init__( + self, + spatial_dims: int, + num_channels: int, + num_head_channels: int | None = None, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + attention_dtype: Optional[torch.dtype] = None, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + + self.spatial_dims = spatial_dims + self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=num_channels, eps=norm_eps, affine=True) + # check num_channels is divisible by num_head_channels + if num_head_channels is not None and num_channels % num_head_channels != 0: + raise ValueError("num_channels must be divisible by num_head_channels") + num_heads = num_channels // num_head_channels if num_head_channels is not None else 1 + self.attn = SABlock( + hidden_size=num_channels, + num_heads=num_heads, + qkv_bias=True, + attention_dtype=attention_dtype, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + def forward(self, x: torch.Tensor): + residual = x + + if self.spatial_dims == 1: + h = x.shape[2] + rearrange_input = Rearrange("b c h -> b h c") + rearrange_output = Rearrange("b h c -> b c h", h=h) + elif self.spatial_dims == 2: # elif: the 1D case must not fall through to the 3D branch below + h, w = x.shape[2], x.shape[3] + rearrange_input = Rearrange("b c h w -> b (h w) c") + rearrange_output = Rearrange("b (h w) c -> b c h w", h=h, w=w) + else: + h, w, d = x.shape[2], x.shape[3], x.shape[4] + rearrange_input = Rearrange("b c h w d -> b (h w d) c") + rearrange_output = Rearrange("b (h w d) c -> b c h w d", h=h, w=w, d=d) + + x = self.norm(x) + x = rearrange_input(x) # B x (x_dim * y_dim [ * z_dim]) x C + + x = self.attn(x) + x = rearrange_output(x) # B x C x x_dim x y_dim [x z_dim] + x = x + residual + return x diff --git a/monai/networks/blocks/transformerblock.py b/monai/networks/blocks/transformerblock.py index ddf959dad2..05eb3b07ab 100644 --- a/monai/networks/blocks/transformerblock.py +++ 
b/monai/networks/blocks/transformerblock.py @@ -11,10 +11,12 @@ from __future__ import annotations +from typing import Optional + +import torch import torch.nn as nn -from monai.networks.blocks.mlp import MLPBlock -from monai.networks.blocks.selfattention import SABlock +from monai.networks.blocks import CrossAttentionBlock, MLPBlock, SABlock class TransformerBlock(nn.Module): @@ -31,6 +33,12 @@ def __init__( dropout_rate: float = 0.0, qkv_bias: bool = False, save_attn: bool = False, + causal: bool = False, + sequence_length: int | None = None, + with_cross_attention: bool = False, + use_flash_attention: bool = False, + include_fc: bool = True, + use_combined_linear: bool = True, ) -> None: """ Args: @@ -38,8 +46,12 @@ def __init__( mlp_dim (int): dimension of feedforward layer. num_heads (int): number of attention heads. dropout_rate (float, optional): fraction of the input units to drop. Defaults to 0.0. - qkv_bias (bool, optional): apply bias term for the qkv linear layer. Defaults to False. + qkv_bias(bool, optional): apply bias term for the qkv linear layer. Defaults to False. save_attn (bool, optional): to make accessible the attention matrix. Defaults to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to True. 
""" @@ -53,10 +65,34 @@ def __init__( self.mlp = MLPBlock(hidden_size, mlp_dim, dropout_rate) self.norm1 = nn.LayerNorm(hidden_size) - self.attn = SABlock(hidden_size, num_heads, dropout_rate, qkv_bias, save_attn) + self.attn = SABlock( + hidden_size, + num_heads, + dropout_rate, + qkv_bias=qkv_bias, + save_attn=save_attn, + causal=causal, + sequence_length=sequence_length, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) self.norm2 = nn.LayerNorm(hidden_size) + self.with_cross_attention = with_cross_attention + + self.norm_cross_attn = nn.LayerNorm(hidden_size) + self.cross_attn = CrossAttentionBlock( + hidden_size=hidden_size, + num_heads=num_heads, + dropout_rate=dropout_rate, + qkv_bias=qkv_bias, + causal=False, + use_flash_attention=use_flash_attention, + ) - def forward(self, x): + def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None) -> torch.Tensor: x = x + self.attn(self.norm1(x)) + if self.with_cross_attention: + x = x + self.cross_attn(self.norm_cross_attn(x), context=context) x = x + self.mlp(self.norm2(x)) return x diff --git a/monai/networks/blocks/upsample.py b/monai/networks/blocks/upsample.py index dee9966919..50fd39a70b 100644 --- a/monai/networks/blocks/upsample.py +++ b/monai/networks/blocks/upsample.py @@ -17,8 +17,8 @@ import torch.nn as nn from monai.networks.layers.factories import Conv, Pad, Pool -from monai.networks.utils import icnr_init, pixelshuffle -from monai.utils import InterpolateMode, UpsampleMode, ensure_tuple_rep, look_up_option +from monai.networks.utils import CastTempType, icnr_init, pixelshuffle +from monai.utils import InterpolateMode, UpsampleMode, ensure_tuple_rep, look_up_option, pytorch_after __all__ = ["Upsample", "UpSample", "SubpixelUpsample", "Subpixelupsample", "SubpixelUpSample"] @@ -50,6 +50,7 @@ def __init__( size: tuple[int] | int | None = None, mode: UpsampleMode | str = UpsampleMode.DECONV, pre_conv: nn.Module | str | None 
= "default", + post_conv: nn.Module | None = None, interp_mode: str = InterpolateMode.LINEAR, align_corners: bool | None = True, bias: bool = True, @@ -71,6 +72,7 @@ def __init__( pre_conv: a conv block applied before upsampling. Defaults to "default". When ``conv_block`` is ``"default"``, one reserved conv layer will be utilized when Only used in the "nontrainable" or "pixelshuffle" mode. + post_conv: a conv block applied after upsampling. Defaults to None. Only used in the "nontrainable" mode. interp_mode: {``"nearest"``, ``"linear"``, ``"bilinear"``, ``"bicubic"``, ``"trilinear"``} Only used in the "nontrainable" mode. If ends with ``"linear"`` will use ``spatial dims`` to determine the correct interpolation. @@ -154,15 +156,25 @@ def __init__( linear_mode = [InterpolateMode.LINEAR, InterpolateMode.BILINEAR, InterpolateMode.TRILINEAR] if interp_mode in linear_mode: # choose mode based on dimensions interp_mode = linear_mode[spatial_dims - 1] - self.add_module( - "upsample_non_trainable", - nn.Upsample( - size=size, - scale_factor=None if size else scale_factor_, - mode=interp_mode.value, - align_corners=align_corners, - ), + + upsample = nn.Upsample( + size=size, + scale_factor=None if size else scale_factor_, + mode=interp_mode.value, + align_corners=align_corners, ) + + # Cast to float32 as 'upsample_nearest2d_out_frame' op does not support bfloat16 + # https://github.com/pytorch/pytorch/issues/86679. 
This issue is solved in PyTorch 2.1 + if pytorch_after(major=2, minor=1): + self.add_module("upsample_non_trainable", upsample) + else: + self.add_module( + "upsample_non_trainable", + CastTempType(initial_type=torch.bfloat16, temporary_type=torch.float32, submodule=upsample), + ) + if post_conv: + self.add_module("postconv", post_conv) elif up_mode == UpsampleMode.PIXELSHUFFLE: self.add_module( "pixelshuffle", diff --git a/monai/networks/layers/__init__.py b/monai/networks/layers/__init__.py index d61ed57f7f..48c10270b1 100644 --- a/monai/networks/layers/__init__.py +++ b/monai/networks/layers/__init__.py @@ -11,9 +11,10 @@ from __future__ import annotations +from .conjugate_gradient import ConjugateGradient from .convutils import calculate_out_shape, gaussian_1d, polyval, same_padding, stride_minus_kernel_padding from .drop_path import DropPath -from .factories import Act, Conv, Dropout, LayerFactory, Norm, Pad, Pool, split_args +from .factories import Act, Conv, Dropout, LayerFactory, Norm, Pad, Pool, RelPosEmbedding, split_args from .filtering import BilateralFilter, PHLFilter, TrainableBilateralFilter, TrainableJointBilateralFilter from .gmm import GaussianMixtureModel from .simplelayers import ( @@ -37,4 +38,5 @@ ) from .spatial_transforms import AffineTransform, grid_count, grid_grad, grid_pull, grid_push from .utils import get_act_layer, get_dropout_layer, get_norm_layer, get_pool_layer +from .vector_quantizer import EMAQuantizer, VectorQuantizer from .weight_init import _no_grad_trunc_normal_, trunc_normal_ diff --git a/monai/networks/layers/conjugate_gradient.py b/monai/networks/layers/conjugate_gradient.py new file mode 100644 index 0000000000..93a45930d7 --- /dev/null +++ b/monai/networks/layers/conjugate_gradient.py @@ -0,0 +1,112 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Callable + +import torch +from torch import nn + + +def _zdot(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + """ + Complex dot product between tensors x1 and x2: sum(conj(x1) * x2) + """ + if torch.is_complex(x1): + assert torch.is_complex(x2), "x1 and x2 must both be complex" + return torch.sum(x1.conj() * x2) + else: + return torch.sum(x1 * x2) + + +def _zdot_single(x: torch.Tensor) -> torch.Tensor: + """ + Complex dot product between tensor x and itself; returns the real part when the result is complex + """ + res = _zdot(x, x) + if torch.is_complex(res): + return res.real + else: + return res + + +class ConjugateGradient(nn.Module): + """ + Conjugate Gradient (CG) solver for linear systems Ax = y. + + For linear_op that is positive definite and self-adjoint, CG is + guaranteed to converge. CG is often used to solve linear systems of the form + Ax = y, where A is too large to store explicitly, but can be computed via a + linear operator. + + As a result, here we won't set A explicitly as a matrix, but rather as a + linear operator. For example, A could be a FFT/IFFT operation + """ + + def __init__(self, linear_op: Callable, num_iter: int): + """ + Args: + linear_op: Linear operator + num_iter: Number of iterations to run CG + """ + super().__init__() + + self.linear_op = linear_op + self.num_iter = num_iter + + def update( + self, x: torch.Tensor, p: torch.Tensor, r: torch.Tensor, rsold: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + perform one iteration of the CG method. 
It takes the current solution x, + the current search direction p, the current residual r, and the old + residual norm rsold as inputs. Then it computes the new solution, search + direction, residual, and residual norm, and returns them. + """ + + dy = self.linear_op(p) + p_dot_dy = _zdot(p, dy) + alpha = rsold / p_dot_dy + x = x + alpha * p + r = r - alpha * dy + rsnew = _zdot_single(r) + beta = rsnew / rsold + rsold = rsnew + p = beta * p + r + return x, p, r, rsold + + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """ + run conjugate gradient for num_iter iterations to solve Ax = y + + Args: + x: tensor (real or complex); Initial guess for linear system Ax = y. + The size of x should be applicable to the linear operator. For + example, if the linear operator is FFT, then x is HCHW; if the + linear operator is a matrix multiplication, then x is a vector + + y: tensor (real or complex); Measurement. Same size as x + + Returns: + x: Solution to Ax = y + """ + # Compute residual + r = y - self.linear_op(x) + rsold = _zdot_single(r) + p = r + + # Update + for _i in range(self.num_iter): + x, p, r, rsold = self.update(x, p, r, rsold) + if rsold < 1e-10: + break + return x diff --git a/monai/networks/layers/factories.py b/monai/networks/layers/factories.py index 4fc2c16f73..29b72a4f37 100644 --- a/monai/networks/layers/factories.py +++ b/monai/networks/layers/factories.py @@ -70,7 +70,7 @@ def use_factory(fact_args): from monai.networks.utils import has_nvfuser_instance_norm from monai.utils import ComponentStore, look_up_option, optional_import -__all__ = ["LayerFactory", "Dropout", "Norm", "Act", "Conv", "Pool", "Pad", "split_args"] +__all__ = ["LayerFactory", "Dropout", "Norm", "Act", "Conv", "Pool", "Pad", "RelPosEmbedding", "split_args"] class LayerFactory(ComponentStore): @@ -201,6 +201,10 @@ def split_args(args): Conv = LayerFactory(name="Convolution layers", description="Factory for creating convolution layers.") Pool = 
LayerFactory(name="Pooling layers", description="Factory for creating pooling layers.") Pad = LayerFactory(name="Padding layers", description="Factory for creating padding layers.") +RelPosEmbedding = LayerFactory( + name="Relative positional embedding layers", + description="Factory for creating relative positional embedding factory", +) @Dropout.factory_function("dropout") @@ -468,3 +472,10 @@ def constant_pad_factory(dim: int) -> type[nn.ConstantPad1d | nn.ConstantPad2d | """ types = (nn.ConstantPad1d, nn.ConstantPad2d, nn.ConstantPad3d) return types[dim - 1] + + +@RelPosEmbedding.factory_function("decomposed") +def decomposed_rel_pos_embedding() -> type[nn.Module]: + from monai.networks.blocks.rel_pos_embedding import DecomposedRelativePosEmbedding + + return DecomposedRelativePosEmbedding diff --git a/monai/networks/layers/simplelayers.py b/monai/networks/layers/simplelayers.py index 4ac621967f..4acd4a3622 100644 --- a/monai/networks/layers/simplelayers.py +++ b/monai/networks/layers/simplelayers.py @@ -452,7 +452,7 @@ def get_binary_kernel(window_size: Sequence[int], dtype=torch.float, device=None def median_filter( in_tensor: torch.Tensor, - kernel_size: Sequence[int] = (3, 3, 3), + kernel_size: Sequence[int] | int = (3, 3, 3), spatial_dims: int = 3, kernel: torch.Tensor | None = None, **kwargs, diff --git a/monai/networks/layers/utils.py b/monai/networks/layers/utils.py index ace1af27b6..8676f74638 100644 --- a/monai/networks/layers/utils.py +++ b/monai/networks/layers/utils.py @@ -11,9 +11,11 @@ from __future__ import annotations +from typing import Optional + import torch.nn -from monai.networks.layers.factories import Act, Dropout, Norm, Pool, split_args +from monai.networks.layers.factories import Act, Dropout, Norm, Pool, RelPosEmbedding, split_args from monai.utils import has_option __all__ = ["get_norm_layer", "get_act_layer", "get_dropout_layer", "get_pool_layer"] @@ -124,3 +126,14 @@ def get_pool_layer(name: tuple | str, spatial_dims: int | None = 
1): pool_name, pool_args = split_args(name) pool_type = Pool[pool_name, spatial_dims] return pool_type(**pool_args) + + +def get_rel_pos_embedding_layer(name: tuple | str, s_input_dims: Optional[tuple], c_dim: int, num_heads: int): + embedding_name, embedding_args = split_args(name) + embedding_type = RelPosEmbedding[embedding_name] + # create a dictionary with the default values which can be overridden by embedding_args + kw_args = {"s_input_dims": s_input_dims, "c_dim": c_dim, "num_heads": num_heads, **embedding_args} + # filter out unused argument names + kw_args = {k: v for k, v in kw_args.items() if has_option(embedding_type, k)} + + return embedding_type(**kw_args) diff --git a/monai/networks/layers/vector_quantizer.py b/monai/networks/layers/vector_quantizer.py new file mode 100644 index 0000000000..9c354e1009 --- /dev/null +++ b/monai/networks/layers/vector_quantizer.py @@ -0,0 +1,233 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Sequence, Tuple + +import torch +from torch import nn + +__all__ = ["VectorQuantizer", "EMAQuantizer"] + + +class EMAQuantizer(nn.Module): + """ + Vector Quantization module using Exponential Moving Average (EMA) to learn the codebook parameters based on Neural + Discrete Representation Learning by Oord et al. 
(https://arxiv.org/abs/1711.00937) and the official implementation + that can be found at https://github.com/deepmind/sonnet/blob/v2/sonnet/src/nets/vqvae.py#L148 and commit + 58d9a2746493717a7c9252938da7efa6006f3739. + + This module is not compatible with TorchScript while working in a Distributed Data Parallelism Module. This is due + to lack of TorchScript support for torch.distributed module as per https://github.com/pytorch/pytorch/issues/41353 + on 22/10/2022. If you want to TorchScript your model, please turn set `ddp_sync` to False. + + Args: + spatial_dims: number of spatial dimensions of the input. + num_embeddings: number of atomic elements in the codebook. + embedding_dim: number of channels of the input and atomic elements. + commitment_cost: scaling factor of the MSE loss between input and its quantized version. Defaults to 0.25. + decay: EMA decay. Defaults to 0.99. + epsilon: epsilon value. Defaults to 1e-5. + embedding_init: initialization method for the codebook. Defaults to "normal". + ddp_sync: whether to synchronize the codebook across processes. Defaults to True. + """ + + def __init__( + self, + spatial_dims: int, + num_embeddings: int, + embedding_dim: int, + commitment_cost: float = 0.25, + decay: float = 0.99, + epsilon: float = 1e-5, + embedding_init: str = "normal", + ddp_sync: bool = True, + ): + super().__init__() + self.spatial_dims: int = spatial_dims + self.embedding_dim: int = embedding_dim + self.num_embeddings: int = num_embeddings + + assert self.spatial_dims in [2, 3], ValueError( + f"EMAQuantizer only supports 4D and 5D tensor inputs but received spatial dims {spatial_dims}." 
+ ) + + self.embedding: torch.nn.Embedding = torch.nn.Embedding(self.num_embeddings, self.embedding_dim) + if embedding_init == "normal": + # Initialization is passed since the default one is normal inside the nn.Embedding + pass + elif embedding_init == "kaiming_uniform": + torch.nn.init.kaiming_uniform_(self.embedding.weight.data, mode="fan_in", nonlinearity="linear") + self.embedding.weight.requires_grad = False + + self.commitment_cost: float = commitment_cost + + self.register_buffer("ema_cluster_size", torch.zeros(self.num_embeddings)) + self.register_buffer("ema_w", self.embedding.weight.data.clone()) + # declare types for mypy + self.ema_cluster_size: torch.Tensor + self.ema_w: torch.Tensor + self.decay: float = decay + self.epsilon: float = epsilon + + self.ddp_sync: bool = ddp_sync + + # Precalculating required permutation shapes + self.flatten_permutation = [0] + list(range(2, self.spatial_dims + 2)) + [1] + self.quantization_permutation: Sequence[int] = [0, self.spatial_dims + 1] + list( + range(1, self.spatial_dims + 1) + ) + + def quantize(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Given an input it projects it to the quantized space and returns additional tensors needed for EMA loss. + + Args: + inputs: Encoding space tensors of shape [B, C, H, W, D]. + + Returns: + torch.Tensor: Flatten version of the input of shape [B*H*W*D, C]. + torch.Tensor: One-hot representation of the quantization indices of shape [B*H*W*D, self.num_embeddings]. 
+ torch.Tensor: Quantization indices of shape [B,H,W,D,1] + + """ + with torch.cuda.amp.autocast(enabled=False): + encoding_indices_view = list(inputs.shape) + del encoding_indices_view[1] + + inputs = inputs.float() + + # Converting to channel last format + flat_input = inputs.permute(self.flatten_permutation).contiguous().view(-1, self.embedding_dim) + + # Calculate Euclidean distances + distances = ( + (flat_input**2).sum(dim=1, keepdim=True) + + (self.embedding.weight.t() ** 2).sum(dim=0, keepdim=True) + - 2 * torch.mm(flat_input, self.embedding.weight.t()) + ) + + # Mapping distances to indexes + encoding_indices = torch.max(-distances, dim=1)[1] + encodings = torch.nn.functional.one_hot(encoding_indices, self.num_embeddings).float() + + # Quantize and reshape + encoding_indices = encoding_indices.view(encoding_indices_view) + + return flat_input, encodings, encoding_indices + + def embed(self, embedding_indices: torch.Tensor) -> torch.Tensor: + """ + Given encoding indices of shape [B,D,H,W,1] embeds them in the quantized space + [B, D, H, W, self.embedding_dim] and reshapes them to [B, self.embedding_dim, D, H, W] to be fed to the + decoder. + + Args: + embedding_indices: Tensor in channel last format which holds indices referencing atomic + elements from self.embedding + + Returns: + torch.Tensor: Quantize space representation of encoding_indices in channel first format. + """ + with torch.cuda.amp.autocast(enabled=False): + embedding: torch.Tensor = ( + self.embedding(embedding_indices).permute(self.quantization_permutation).contiguous() + ) + return embedding + + def distributed_synchronization(self, encodings_sum: torch.Tensor, dw: torch.Tensor) -> None: + """ + TorchScript does not support torch.distributed.all_reduce. 
class VectorQuantizer(torch.nn.Module):
    """
    Thin wrapper around a quantizer module, used as an AMP workaround so that the parts of the
    quantization that are not fp16 compatible live in their own class.

    Besides delegating to the wrapped quantizer, each forward pass stores the perplexity of the
    selected codebook indices in ``self.perplexity``.

    Args:
        quantizer (torch.nn.Module): Quantizer module whose call returns, in this order, the quantized
            representation, the loss and the index based quantized representation.
    """

    def __init__(self, quantizer: EMAQuantizer):
        super().__init__()

        self.quantizer: EMAQuantizer = quantizer

        # Placeholder value; replaced on every forward pass.
        self.perplexity: torch.Tensor = torch.rand(1)

    def forward(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the wrapped quantizer, refresh ``self.perplexity`` and return ``(loss, quantized)``."""
        quantized, loss, encoding_indices = self.quantizer(inputs)

        # Relative frequency of every codebook entry among the selected indices.
        num_codes = self.quantizer.num_embeddings
        code_counts = torch.histc(encoding_indices.float(), bins=num_codes, max=num_codes).float()
        avg_probs = code_counts.div(encoding_indices.numel())

        # Perplexity is the exponential of the entropy; 1e-10 keeps the log finite for unused entries.
        entropy = -torch.sum(avg_probs * torch.log(avg_probs + 1e-10))
        self.perplexity = torch.exp(entropy)

        return loss, quantized

    def embed(self, embedding_indices: torch.Tensor) -> torch.Tensor:
        """Delegate to the wrapped quantizer's ``embed``."""
        return self.quantizer.embed(embedding_indices=embedding_indices)

    def quantize(self, encodings: torch.Tensor) -> torch.Tensor:
        """Call the wrapped quantizer on ``encodings`` and return only its third output (the indices)."""
        encoding_indices: torch.Tensor = self.quantizer(encodings)[2]
        return encoding_indices
DynUNet, DynUnet, Dynunet from .efficientnet import ( @@ -52,6 +55,7 @@ from .hovernet import Hovernet, HoVernet, HoVerNet, HoverNet from .milmodel import MILModel from .netadapter import NetAdapter +from .patchgan_discriminator import MultiScalePatchDiscriminator, PatchDiscriminator from .quicknat import Quicknat from .regressor import Regressor from .regunet import GlobalNet, LocalNet, RegUNet @@ -59,6 +63,8 @@ ResNet, ResNetBlock, ResNetBottleneck, + ResNetEncoder, + ResNetFeatures, get_medicalnet_pretrained_resnet_args, get_pretrained_resnet_medicalnet, resnet10, @@ -70,7 +76,7 @@ resnet200, ) from .segresnet import SegResNet, SegResNetVAE -from .segresnet_ds import SegResNetDS +from .segresnet_ds import SegResNetDS, SegResNetDS2 from .senet import ( SENet, SEnet, @@ -102,13 +108,19 @@ seresnext50, seresnext101, ) +from .spade_autoencoderkl import SPADEAutoencoderKL +from .spade_diffusion_model_unet import SPADEDiffusionModelUNet +from .spade_network import SPADENet from .swin_unetr import PatchMerging, PatchMergingV2, SwinUNETR from .torchvision_fc import TorchVisionFCModel from .transchex import BertAttention, BertMixedLayer, BertOutput, BertPreTrainedModel, MultiModal, Pooler, Transchex +from .transformer import DecoderOnlyTransformer from .unet import UNet, Unet from .unetr import UNETR from .varautoencoder import VarAutoEncoder +from .vista3d import VISTA3D, vista3d132 from .vit import ViT from .vitautoenc import ViTAutoEnc from .vnet import VNet from .voxelmorph import VoxelMorph, VoxelMorphUNet +from .vqvae import VQVAE diff --git a/monai/networks/nets/attentionunet.py b/monai/networks/nets/attentionunet.py index 5689cf1071..fdf31d9701 100644 --- a/monai/networks/nets/attentionunet.py +++ b/monai/networks/nets/attentionunet.py @@ -29,7 +29,7 @@ def __init__( spatial_dims: int, in_channels: int, out_channels: int, - kernel_size: int = 3, + kernel_size: Sequence[int] | int = 3, strides: int = 1, dropout=0.0, ): @@ -219,7 +219,13 @@ def __init__( 
self.kernel_size = kernel_size self.dropout = dropout - head = ConvBlock(spatial_dims=spatial_dims, in_channels=in_channels, out_channels=channels[0], dropout=dropout) + head = ConvBlock( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=channels[0], + dropout=dropout, + kernel_size=self.kernel_size, + ) reduce_channels = Convolution( spatial_dims=spatial_dims, in_channels=channels[0], @@ -245,6 +251,7 @@ def _create_block(channels: Sequence[int], strides: Sequence[int]) -> nn.Module: out_channels=channels[1], strides=strides[0], dropout=self.dropout, + kernel_size=self.kernel_size, ), subblock, ), @@ -271,6 +278,7 @@ def _get_bottom_layer(self, in_channels: int, out_channels: int, strides: int) - out_channels=out_channels, strides=strides, dropout=self.dropout, + kernel_size=self.kernel_size, ), up_kernel_size=self.up_kernel_size, strides=strides, diff --git a/monai/networks/nets/autoencoderkl.py b/monai/networks/nets/autoencoderkl.py new file mode 100644 index 0000000000..af191e748b --- /dev/null +++ b/monai/networks/nets/autoencoderkl.py @@ -0,0 +1,735 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class AEKLResBlock(nn.Module):
    """
    Residual block made of two (group normalisation -> SiLU -> 3-kernel convolution) stages, with a
    residual connection between input and output. When the input and output channel counts differ,
    the skip path goes through a 1-kernel convolution; otherwise it is the identity.

    Args:
        spatial_dims: number of spatial dimensions, could be 1, 2, or 3.
        in_channels: input channels to the layer.
        norm_num_groups: number of groups involved for the group normalisation layer. Ensure that your number of
            channels is divisible by this number.
        norm_eps: epsilon for the normalisation.
        out_channels: number of output channels. ``None`` means "same as ``in_channels``".
    """

    def __init__(
        self, spatial_dims: int, in_channels: int, norm_num_groups: int, norm_eps: float, out_channels: int | None
    ) -> None:
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = in_channels if out_channels is None else out_channels

        self.norm1 = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=norm_eps, affine=True)
        self.conv1 = Convolution(
            spatial_dims=spatial_dims,
            in_channels=self.in_channels,
            out_channels=self.out_channels,
            strides=1,
            kernel_size=3,
            padding=1,
            conv_only=True,
        )
        # norm2 sees the output of conv1, so it must use the resolved channel count
        # (self.out_channels), not the raw argument, which may be None.
        self.norm2 = nn.GroupNorm(
            num_groups=norm_num_groups, num_channels=self.out_channels, eps=norm_eps, affine=True
        )
        self.conv2 = Convolution(
            spatial_dims=spatial_dims,
            in_channels=self.out_channels,
            out_channels=self.out_channels,
            strides=1,
            kernel_size=3,
            padding=1,
            conv_only=True,
        )

        self.nin_shortcut: nn.Module
        if self.in_channels != self.out_channels:
            # Channel counts differ: project the skip path so it can be added to the main path.
            self.nin_shortcut = Convolution(
                spatial_dims=spatial_dims,
                in_channels=self.in_channels,
                out_channels=self.out_channels,
                strides=1,
                kernel_size=1,
                padding=0,
                conv_only=True,
            )
        else:
            self.nin_shortcut = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply both norm/activation/convolution stages and add the (possibly projected) input."""
        h = x
        h = self.norm1(h)
        h = F.silu(h)
        h = self.conv1(h)

        h = self.norm2(h)
        h = F.silu(h)
        h = self.conv2(h)

        x = self.nin_shortcut(x)

        return x + h
+ norm_num_groups: number of groups for the GroupNorm layers, num_channels must be divisible by this number. + norm_eps: epsilon for the normalization. + attention_levels: indicate which level from num_channels contain an attention block. + with_nonlocal_attn: if True use non-local attention block. + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + channels: Sequence[int], + out_channels: int, + num_res_blocks: Sequence[int], + norm_num_groups: int, + norm_eps: float, + attention_levels: Sequence[bool], + with_nonlocal_attn: bool = True, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + self.spatial_dims = spatial_dims + self.in_channels = in_channels + self.channels = channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.norm_num_groups = norm_num_groups + self.norm_eps = norm_eps + self.attention_levels = attention_levels + + blocks: List[nn.Module] = [] + # Initial convolution + blocks.append( + Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=channels[0], + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + + # Residual and downsampling blocks + output_channel = channels[0] + for i in range(len(channels)): + input_channel = output_channel + output_channel = channels[i] + is_final_block = i == len(channels) - 1 + + for _ in range(self.num_res_blocks[i]): + blocks.append( + AEKLResBlock( + spatial_dims=spatial_dims, + in_channels=input_channel, + norm_num_groups=norm_num_groups, + 
norm_eps=norm_eps, + out_channels=output_channel, + ) + ) + input_channel = output_channel + if attention_levels[i]: + blocks.append( + SpatialAttentionBlock( + spatial_dims=spatial_dims, + num_channels=input_channel, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + ) + + if not is_final_block: + blocks.append(AEKLDownsample(spatial_dims=spatial_dims, in_channels=input_channel)) + # Non-local attention block + if with_nonlocal_attn is True: + blocks.append( + AEKLResBlock( + spatial_dims=spatial_dims, + in_channels=channels[-1], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=channels[-1], + ) + ) + + blocks.append( + SpatialAttentionBlock( + spatial_dims=spatial_dims, + num_channels=channels[-1], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + ) + blocks.append( + AEKLResBlock( + spatial_dims=spatial_dims, + in_channels=channels[-1], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=channels[-1], + ) + ) + # Normalise and convert to latent size + blocks.append(nn.GroupNorm(num_groups=norm_num_groups, num_channels=channels[-1], eps=norm_eps, affine=True)) + blocks.append( + Convolution( + spatial_dims=self.spatial_dims, + in_channels=channels[-1], + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + + self.blocks = nn.ModuleList(blocks) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for block in self.blocks: + x = block(x) + return x + + +class Decoder(nn.Module): + """ + Convolutional cascade upsampling from a spatial latent space into an image space. + + Args: + spatial_dims: number of spatial dimensions, could be 1, 2, or 3. + channels: sequence of block output channels. 
+ in_channels: number of channels in the bottom layer (latent space) of the autoencoder. + out_channels: number of output channels. + num_res_blocks: number of residual blocks (see _ResBlock) per level. + norm_num_groups: number of groups for the GroupNorm layers, num_channels must be divisible by this number. + norm_eps: epsilon for the normalization. + attention_levels: indicate which level from num_channels contain an attention block. + with_nonlocal_attn: if True use non-local attention block. + use_convtranspose: if True, use ConvTranspose to upsample feature maps in decoder. + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). + """ + + def __init__( + self, + spatial_dims: int, + channels: Sequence[int], + in_channels: int, + out_channels: int, + num_res_blocks: Sequence[int], + norm_num_groups: int, + norm_eps: float, + attention_levels: Sequence[bool], + with_nonlocal_attn: bool = True, + use_convtranspose: bool = False, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + self.spatial_dims = spatial_dims + self.channels = channels + self.in_channels = in_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.norm_num_groups = norm_num_groups + self.norm_eps = norm_eps + self.attention_levels = attention_levels + + reversed_block_out_channels = list(reversed(channels)) + + blocks: List[nn.Module] = [] + + # Initial convolution + blocks.append( + Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=reversed_block_out_channels[0], + strides=1, + kernel_size=3, + padding=1, + 
conv_only=True, + ) + ) + + # Non-local attention block + if with_nonlocal_attn is True: + blocks.append( + AEKLResBlock( + spatial_dims=spatial_dims, + in_channels=reversed_block_out_channels[0], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=reversed_block_out_channels[0], + ) + ) + blocks.append( + SpatialAttentionBlock( + spatial_dims=spatial_dims, + num_channels=reversed_block_out_channels[0], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + ) + blocks.append( + AEKLResBlock( + spatial_dims=spatial_dims, + in_channels=reversed_block_out_channels[0], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=reversed_block_out_channels[0], + ) + ) + + reversed_attention_levels = list(reversed(attention_levels)) + reversed_num_res_blocks = list(reversed(num_res_blocks)) + block_out_ch = reversed_block_out_channels[0] + for i in range(len(reversed_block_out_channels)): + block_in_ch = block_out_ch + block_out_ch = reversed_block_out_channels[i] + is_final_block = i == len(channels) - 1 + + for _ in range(reversed_num_res_blocks[i]): + blocks.append( + AEKLResBlock( + spatial_dims=spatial_dims, + in_channels=block_in_ch, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=block_out_ch, + ) + ) + block_in_ch = block_out_ch + + if reversed_attention_levels[i]: + blocks.append( + SpatialAttentionBlock( + spatial_dims=spatial_dims, + num_channels=block_in_ch, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + ) + + if not is_final_block: + if use_convtranspose: + blocks.append( + Upsample( + spatial_dims=spatial_dims, mode="deconv", in_channels=block_in_ch, out_channels=block_in_ch + ) + ) + else: + post_conv = Convolution( + spatial_dims=spatial_dims, + 
in_channels=block_in_ch, + out_channels=block_in_ch, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + blocks.append( + Upsample( + spatial_dims=spatial_dims, + mode="nontrainable", + in_channels=block_in_ch, + out_channels=block_in_ch, + interp_mode="nearest", + scale_factor=2.0, + post_conv=post_conv, + align_corners=None, + ) + ) + + blocks.append(nn.GroupNorm(num_groups=norm_num_groups, num_channels=block_in_ch, eps=norm_eps, affine=True)) + blocks.append( + Convolution( + spatial_dims=spatial_dims, + in_channels=block_in_ch, + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + + self.blocks = nn.ModuleList(blocks) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for block in self.blocks: + x = block(x) + return x + + +class AutoencoderKL(nn.Module): + """ + Autoencoder model with KL-regularized latent space based on + Rombach et al. "High-Resolution Image Synthesis with Latent Diffusion Models" https://arxiv.org/abs/2112.10752 + and Pinaya et al. "Brain Imaging Generation with Latent Diffusion Models" https://arxiv.org/abs/2209.07162 + + Args: + spatial_dims: number of spatial dimensions, could be 1, 2, or 3. + in_channels: number of input channels. + out_channels: number of output channels. + num_res_blocks: number of residual blocks (see _ResBlock) per level. + channels: number of output channels for each block. + attention_levels: sequence of levels to add attention. + latent_channels: latent embedding dimension. + norm_num_groups: number of groups for the GroupNorm layers, num_channels must be divisible by this number. + norm_eps: epsilon for the normalization. + with_encoder_nonlocal_attn: if True use non-local attention block in the encoder. + with_decoder_nonlocal_attn: if True use non-local attention block in the decoder. + use_checkpoint: if True, use activation checkpoint to save memory. + use_convtranspose: if True, use ConvTranspose to upsample feature maps in decoder. 
+ include_fc: whether to include the final linear layer in the attention block. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection in the attention block, default to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int = 1, + out_channels: int = 1, + num_res_blocks: Sequence[int] | int = (2, 2, 2, 2), + channels: Sequence[int] = (32, 64, 64, 64), + attention_levels: Sequence[bool] = (False, False, True, True), + latent_channels: int = 3, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + with_encoder_nonlocal_attn: bool = True, + with_decoder_nonlocal_attn: bool = True, + use_checkpoint: bool = False, + use_convtranspose: bool = False, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + + # All number of channels should be multiple of num_groups + if any((out_channel % norm_num_groups) != 0 for out_channel in channels): + raise ValueError("AutoencoderKL expects all num_channels being multiple of norm_num_groups") + + if len(channels) != len(attention_levels): + raise ValueError("AutoencoderKL expects num_channels being same size of attention_levels") + + if isinstance(num_res_blocks, int): + num_res_blocks = ensure_tuple_rep(num_res_blocks, len(channels)) + + if len(num_res_blocks) != len(channels): + raise ValueError( + "`num_res_blocks` should be a single integer or a tuple of integers with the same length as " + "`num_channels`." 
+ ) + + self.encoder: nn.Module = Encoder( + spatial_dims=spatial_dims, + in_channels=in_channels, + channels=channels, + out_channels=latent_channels, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + attention_levels=attention_levels, + with_nonlocal_attn=with_encoder_nonlocal_attn, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + self.decoder: nn.Module = Decoder( + spatial_dims=spatial_dims, + channels=channels, + in_channels=latent_channels, + out_channels=out_channels, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + attention_levels=attention_levels, + with_nonlocal_attn=with_decoder_nonlocal_attn, + use_convtranspose=use_convtranspose, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + self.quant_conv_mu = Convolution( + spatial_dims=spatial_dims, + in_channels=latent_channels, + out_channels=latent_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + self.quant_conv_log_sigma = Convolution( + spatial_dims=spatial_dims, + in_channels=latent_channels, + out_channels=latent_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + self.post_quant_conv = Convolution( + spatial_dims=spatial_dims, + in_channels=latent_channels, + out_channels=latent_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + self.latent_channels = latent_channels + self.use_checkpoint = use_checkpoint + + def encode(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """ + Forwards an image through the spatial encoder, obtaining the latent mean and sigma representations. 
+ + Args: + x: BxCx[SPATIAL DIMS] tensor + + """ + if self.use_checkpoint: + h = torch.utils.checkpoint.checkpoint(self.encoder, x, use_reentrant=False) + else: + h = self.encoder(x) + + z_mu = self.quant_conv_mu(h) + z_log_var = self.quant_conv_log_sigma(h) + z_log_var = torch.clamp(z_log_var, -30.0, 20.0) + z_sigma = torch.exp(z_log_var / 2) + + return z_mu, z_sigma + + def sampling(self, z_mu: torch.Tensor, z_sigma: torch.Tensor) -> torch.Tensor: + """ + From the mean and sigma representations resulting of encoding an image through the latent space, + obtains a noise sample resulting from sampling gaussian noise, multiplying by the variance (sigma) and + adding the mean. + + Args: + z_mu: Bx[Z_CHANNELS]x[LATENT SPACE SIZE] mean vector obtained by the encoder when you encode an image + z_sigma: Bx[Z_CHANNELS]x[LATENT SPACE SIZE] variance vector obtained by the encoder when you encode an image + + Returns: + sample of shape Bx[Z_CHANNELS]x[LATENT SPACE SIZE] + """ + eps = torch.randn_like(z_sigma) + z_vae = z_mu + eps * z_sigma + return z_vae + + def reconstruct(self, x: torch.Tensor) -> torch.Tensor: + """ + Encodes and decodes an input image. + + Args: + x: BxCx[SPATIAL DIMENSIONS] tensor. + + Returns: + reconstructed image, of the same shape as input + """ + z_mu, _ = self.encode(x) + reconstruction = self.decode(z_mu) + return reconstruction + + def decode(self, z: torch.Tensor) -> torch.Tensor: + """ + Based on a latent space sample, forwards it through the Decoder. 
+ + Args: + z: Bx[Z_CHANNELS]x[LATENT SPACE SHAPE] + + Returns: + decoded image tensor + """ + z = self.post_quant_conv(z) + dec: torch.Tensor + if self.use_checkpoint: + dec = torch.utils.checkpoint.checkpoint(self.decoder, z, use_reentrant=False) + else: + dec = self.decoder(z) + return dec + + def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + z_mu, z_sigma = self.encode(x) + z = self.sampling(z_mu, z_sigma) + reconstruction = self.decode(z) + return reconstruction, z_mu, z_sigma + + def encode_stage_2_inputs(self, x: torch.Tensor) -> torch.Tensor: + z_mu, z_sigma = self.encode(x) + z = self.sampling(z_mu, z_sigma) + return z + + def decode_stage_2_outputs(self, z: torch.Tensor) -> torch.Tensor: + image = self.decode(z) + return image + + def load_old_state_dict(self, old_state_dict: dict, verbose=False) -> None: + """ + Load a state dict from an AutoencoderKL trained with [MONAI Generative](https://github.com/Project-MONAI/GenerativeModels). + + Args: + old_state_dict: state dict from the old AutoencoderKL model. 
+ """ + + new_state_dict = self.state_dict() + # if all keys match, just load the state dict + if all(k in new_state_dict for k in old_state_dict): + print("All keys match, loading state dict.") + self.load_state_dict(old_state_dict) + return + + if verbose: + # print all new_state_dict keys that are not in old_state_dict + for k in new_state_dict: + if k not in old_state_dict: + print(f"key {k} not found in old state dict") + # and vice versa + print("----------------------------------------------") + for k in old_state_dict: + if k not in new_state_dict: + print(f"key {k} not found in new state dict") + + # copy over all matching keys + for k in new_state_dict: + if k in old_state_dict: + new_state_dict[k] = old_state_dict.pop(k) + + # fix the attention blocks + attention_blocks = [k.replace(".attn.to_q.weight", "") for k in new_state_dict if "attn.to_q.weight" in k] + for block in attention_blocks: + new_state_dict[f"{block}.attn.to_q.weight"] = old_state_dict.pop(f"{block}.to_q.weight") + new_state_dict[f"{block}.attn.to_k.weight"] = old_state_dict.pop(f"{block}.to_k.weight") + new_state_dict[f"{block}.attn.to_v.weight"] = old_state_dict.pop(f"{block}.to_v.weight") + new_state_dict[f"{block}.attn.to_q.bias"] = old_state_dict.pop(f"{block}.to_q.bias") + new_state_dict[f"{block}.attn.to_k.bias"] = old_state_dict.pop(f"{block}.to_k.bias") + new_state_dict[f"{block}.attn.to_v.bias"] = old_state_dict.pop(f"{block}.to_v.bias") + + # old version did not have a projection so set these to the identity + new_state_dict[f"{block}.attn.out_proj.weight"] = torch.eye( + new_state_dict[f"{block}.attn.out_proj.weight"].shape[0] + ) + new_state_dict[f"{block}.attn.out_proj.bias"] = torch.zeros( + new_state_dict[f"{block}.attn.out_proj.bias"].shape + ) + + # fix the upsample conv blocks which were renamed postconv + for k in new_state_dict: + if "postconv" in k: + old_name = k.replace("postconv", "conv") + new_state_dict[k] = old_state_dict.pop(old_name) + if verbose: + # print 
build_sam_vit_b, has_sam = optional_import("segment_anything.build_sam", name="build_sam_vit_b")

# Was misspelled `_all__`, which left the module without an effective export list.
__all__ = ["CellSamWrapper"]


class CellSamWrapper(torch.nn.Module):
    """
    CellSamWrapper is a thin wrapper around the SAM model https://github.com/facebookresearch/segment-anything
    with an image only decoder, that can be used for segmentation tasks.

    The SAM prompt encoder is removed and the SAM mask decoder is replaced by a small
    BatchNorm/ReLU/ConvTranspose2d stack that maps 256 feature channels to 3 output channels.

    Args:
        auto_resize_inputs: whether to resize inputs before passing to the network.
            (usually they need be resized, unless they are already at the expected size)
        network_resize_roi: expected input size for the network.
            (currently SAM expects 1024x1024)
        checkpoint: checkpoint file to load the SAM weights from.
            (this can be downloaded from SAM repo https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth)
        return_features: whether to return features from SAM encoder
            (without using decoder/upsampling to the original input size)

    Raises:
        ValueError: if the optional ``segment_anything`` package could not be imported.
    """

    def __init__(
        self,
        auto_resize_inputs=True,
        network_resize_roi=(1024, 1024),
        checkpoint="sam_vit_b_01ec64.pth",
        return_features=False,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)

        self.network_resize_roi = network_resize_roi
        self.auto_resize_inputs = auto_resize_inputs
        self.return_features = return_features

        if not has_sam:
            raise ValueError(
                "SAM is not installed, please run: pip install git+https://github.com/facebookresearch/segment-anything.git"
            )

        model = build_sam_vit_b(checkpoint=checkpoint)

        # Only the image encoder of SAM is kept; prompts are not used.
        model.prompt_encoder = None

        # Replace SAM's mask decoder with an image-only decoder; the two stride-2 transposed
        # convolutions upsample the encoder features by a factor of 4 in total.
        model.mask_decoder = nn.Sequential(
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1, bias=False),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(128, 3, kernel_size=3, stride=2, padding=1, output_padding=1, bias=True),
        )

        self.model = model

    def forward(self, x):
        """
        Encode ``x`` with the SAM image encoder and, unless ``return_features`` is set, decode it.

        With ``auto_resize_inputs`` the input is bilinearly resized to ``network_resize_roi`` before
        encoding, and the decoded output is resized back to the input's original spatial size.
        Encoder features (``return_features=True``) are returned without that final resize.
        """
        sh = x.shape[2:]  # original spatial size, needed to resize the decoded output back

        if self.auto_resize_inputs:
            x = F.interpolate(x, size=self.network_resize_roi, mode="bilinear")

        x = self.model.image_encoder(x)

        if not self.return_features:
            x = self.model.mask_decoder(x)
            if self.auto_resize_inputs:
                x = F.interpolate(x, size=sh, mode="bilinear")

        return x
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ========================================================================= +# Adapted from https://github.com/huggingface/diffusers +# which has the following license: +# https://github.com/huggingface/diffusers/blob/main/LICENSE +# +# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= + +from __future__ import annotations + +from collections.abc import Sequence + +import torch +from torch import nn + +from monai.networks.blocks import Convolution +from monai.networks.nets.diffusion_model_unet import get_down_block, get_mid_block, get_timestep_embedding +from monai.utils import ensure_tuple_rep + + +class ControlNetConditioningEmbedding(nn.Module): + """ + Network to encode the conditioning into a latent space. 
+ """ + + def __init__(self, spatial_dims: int, in_channels: int, out_channels: int, channels: Sequence[int]): + super().__init__() + + self.conv_in = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=channels[0], + strides=1, + kernel_size=3, + padding=1, + adn_ordering="A", + act="SWISH", + ) + + self.blocks = nn.ModuleList([]) + + for i in range(len(channels) - 1): + channel_in = channels[i] + channel_out = channels[i + 1] + self.blocks.append( + Convolution( + spatial_dims=spatial_dims, + in_channels=channel_in, + out_channels=channel_in, + strides=1, + kernel_size=3, + padding=1, + adn_ordering="A", + act="SWISH", + ) + ) + + self.blocks.append( + Convolution( + spatial_dims=spatial_dims, + in_channels=channel_in, + out_channels=channel_out, + strides=2, + kernel_size=3, + padding=1, + adn_ordering="A", + act="SWISH", + ) + ) + + self.conv_out = zero_module( + Convolution( + spatial_dims=spatial_dims, + in_channels=channels[-1], + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + + def forward(self, conditioning): + embedding = self.conv_in(conditioning) + + for block in self.blocks: + embedding = block(embedding) + + embedding = self.conv_out(embedding) + + return embedding + + +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module + + +class ControlNet(nn.Module): + """ + Control network for diffusion models based on Zhang and Agrawala "Adding Conditional Control to Text-to-Image + Diffusion Models" (https://arxiv.org/abs/2302.05543) + + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of input channels. + num_res_blocks: number of residual blocks (see ResnetBlock) per level. + channels: tuple of block output channels. + attention_levels: list of levels to add attention. + norm_num_groups: number of groups for the normalization. + norm_eps: epsilon for the normalization. 
+ resblock_updown: if True use residual blocks for up/downsampling. + num_head_channels: number of channels in each attention head. + with_conditioning: if True add spatial transformers to perform conditioning. + transformer_num_layers: number of layers of Transformer blocks to use. + cross_attention_dim: number of context dimensions to use. + num_class_embeds: if specified (as an int), then this model will be class-conditional with `num_class_embeds` + classes. + upcast_attention: if True, upcast attention operations to full precision. + conditioning_embedding_in_channels: number of input channels for the conditioning embedding. + conditioning_embedding_num_channels: number of channels for the blocks in the conditioning embedding. + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + num_res_blocks: Sequence[int] | int = (2, 2, 2, 2), + channels: Sequence[int] = (32, 64, 64, 64), + attention_levels: Sequence[bool] = (False, False, True, True), + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + resblock_updown: bool = False, + num_head_channels: int | Sequence[int] = 8, + with_conditioning: bool = False, + transformer_num_layers: int = 1, + cross_attention_dim: int | None = None, + num_class_embeds: int | None = None, + upcast_attention: bool = False, + conditioning_embedding_in_channels: int = 1, + conditioning_embedding_num_channels: Sequence[int] = (16, 32, 96, 256), + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + if with_conditioning is True and cross_attention_dim is None: + raise ValueError( + "ControlNet expects dimension of the cross-attention conditioning (cross_attention_dim) " + "to be specified when with_conditioning=True." 
+ ) + if cross_attention_dim is not None and with_conditioning is False: + raise ValueError("ControlNet expects with_conditioning=True when specifying the cross_attention_dim.") + + # All number of channels should be multiple of num_groups + if any((out_channel % norm_num_groups) != 0 for out_channel in channels): + raise ValueError( + f"ControlNet expects all channels to be a multiple of norm_num_groups, but got" + f" channels={channels} and norm_num_groups={norm_num_groups}" + ) + + if len(channels) != len(attention_levels): + raise ValueError( + f"ControlNet expects channels to have the same length as attention_levels, but got " + f"channels={channels} and attention_levels={attention_levels}" + ) + + if isinstance(num_head_channels, int): + num_head_channels = ensure_tuple_rep(num_head_channels, len(attention_levels)) + + if len(num_head_channels) != len(attention_levels): + raise ValueError( + f"num_head_channels should have the same length as attention_levels, but got " + f"num_head_channels={num_head_channels} and attention_levels={attention_levels}. For the i levels " + "without attention, i.e. `attention_levels[i]=False`, the num_head_channels[i] will be ignored." + ) + + if isinstance(num_res_blocks, int): + num_res_blocks = ensure_tuple_rep(num_res_blocks, len(channels)) + + if len(num_res_blocks) != len(channels): + raise ValueError( + f"`num_res_blocks` should be a single integer or a tuple of integers with the same length as " + f"`channels`, but got num_res_blocks={num_res_blocks} and channels={channels}."
+ ) + + self.in_channels = in_channels + self.block_out_channels = channels + self.num_res_blocks = num_res_blocks + self.attention_levels = attention_levels + self.num_head_channels = num_head_channels + self.with_conditioning = with_conditioning + + # input + self.conv_in = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=channels[0], + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + + # time + time_embed_dim = channels[0] * 4 + self.time_embed = nn.Sequential( + nn.Linear(channels[0], time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim) + ) + + # class embedding + self.num_class_embeds = num_class_embeds + if num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + + # control net conditioning embedding + self.controlnet_cond_embedding = ControlNetConditioningEmbedding( + spatial_dims=spatial_dims, + in_channels=conditioning_embedding_in_channels, + channels=conditioning_embedding_num_channels, + out_channels=channels[0], + ) + + # down + self.down_blocks = nn.ModuleList([]) + self.controlnet_down_blocks = nn.ModuleList([]) + output_channel = channels[0] + + controlnet_block = Convolution( + spatial_dims=spatial_dims, + in_channels=output_channel, + out_channels=output_channel, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + controlnet_block = zero_module(controlnet_block.conv) + self.controlnet_down_blocks.append(controlnet_block) + + for i in range(len(channels)): + input_channel = output_channel + output_channel = channels[i] + is_final_block = i == len(channels) - 1 + + down_block = get_down_block( + spatial_dims=spatial_dims, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + num_res_blocks=num_res_blocks[i], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_downsample=not is_final_block, + resblock_updown=resblock_updown, + with_attn=(attention_levels[i] and not 
with_conditioning), + with_cross_attn=(attention_levels[i] and with_conditioning), + num_head_channels=num_head_channels[i], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + self.down_blocks.append(down_block) + + for _ in range(num_res_blocks[i]): + controlnet_block = Convolution( + spatial_dims=spatial_dims, + in_channels=output_channel, + out_channels=output_channel, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + # + if not is_final_block: + controlnet_block = Convolution( + spatial_dims=spatial_dims, + in_channels=output_channel, + out_channels=output_channel, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + # mid + mid_block_channel = channels[-1] + + self.middle_block = get_mid_block( + spatial_dims=spatial_dims, + in_channels=mid_block_channel, + temb_channels=time_embed_dim, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + with_conditioning=with_conditioning, + num_head_channels=num_head_channels[-1], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + controlnet_block = Convolution( + spatial_dims=spatial_dims, + in_channels=output_channel, + out_channels=output_channel, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + controlnet_block = zero_module(controlnet_block) + self.controlnet_mid_block = controlnet_block + + def forward( + self, + x: torch.Tensor, + timesteps: torch.Tensor, + 
controlnet_cond: torch.Tensor, + conditioning_scale: float = 1.0, + context: torch.Tensor | None = None, + class_labels: torch.Tensor | None = None, + ) -> tuple[list[torch.Tensor], torch.Tensor]: + """ + Args: + x: input tensor (N, C, H, W, [D]). + timesteps: timestep tensor (N,). + controlnet_cond: controlnet conditioning tensor (N, C, H, W, [D]) + conditioning_scale: conditioning scale. + context: context tensor (N, 1, cross_attention_dim), where cross_attention_dim is specified in the model init. + class_labels: context tensor (N, ). + """ + # 1. time + t_emb = get_timestep_embedding(timesteps, self.block_out_channels[0]) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=x.dtype) + emb = self.time_embed(t_emb) + + # 2. class + if self.num_class_embeds is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + class_emb = self.class_embedding(class_labels) + class_emb = class_emb.to(dtype=x.dtype) + emb = emb + class_emb + + # 3. initial convolution + h = self.conv_in(x) + + controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) + + h += controlnet_cond + + # 4. down + if context is not None and self.with_conditioning is False: + raise ValueError("model should have with_conditioning = True if context is provided") + down_block_res_samples: list[torch.Tensor] = [h] + for downsample_block in self.down_blocks: + h, res_samples = downsample_block(hidden_states=h, temb=emb, context=context) + for residual in res_samples: + down_block_res_samples.append(residual) + + # 5. mid + h = self.middle_block(hidden_states=h, temb=emb, context=context) + + # 6. 
Control net blocks + controlnet_down_block_res_samples = [] + + for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): + down_block_res_sample = controlnet_block(down_block_res_sample) + controlnet_down_block_res_samples.append(down_block_res_sample) + + down_block_res_samples = controlnet_down_block_res_samples + + mid_block_res_sample: torch.Tensor = self.controlnet_mid_block(h) + + # 7. scaling + down_block_res_samples = [h * conditioning_scale for h in down_block_res_samples] + mid_block_res_sample *= conditioning_scale + + return down_block_res_samples, mid_block_res_sample + + def load_old_state_dict(self, old_state_dict: dict, verbose=False) -> None: + """ + Load a state dict from a ControlNet trained with + [MONAI Generative](https://github.com/Project-MONAI/GenerativeModels). + + Args: + old_state_dict: state dict from the old ControlNet model. + """ + + new_state_dict = self.state_dict() + # if all keys match, just load the state dict + if all(k in new_state_dict for k in old_state_dict): + print("All keys match, loading state dict.") + self.load_state_dict(old_state_dict) + return + + if verbose: + # print all new_state_dict keys that are not in old_state_dict + for k in new_state_dict: + if k not in old_state_dict: + print(f"key {k} not found in old state dict") + # and vice versa + print("----------------------------------------------") + for k in old_state_dict: + if k not in new_state_dict: + print(f"key {k} not found in new state dict") + + # copy over all matching keys + for k in new_state_dict: + if k in old_state_dict: + new_state_dict[k] = old_state_dict.pop(k) + + # fix the attention blocks + attention_blocks = [k.replace(".out_proj.weight", "") for k in new_state_dict if "out_proj.weight" in k] + for block in attention_blocks: + # projection + new_state_dict[f"{block}.out_proj.weight"] = old_state_dict.pop(f"{block}.to_out.0.weight") + new_state_dict[f"{block}.out_proj.bias"] =
old_state_dict.pop(f"{block}.to_out.0.bias") + + if verbose: + # print all remaining keys in old_state_dict + print("remaining keys in old_state_dict:", old_state_dict.keys()) + self.load_state_dict(new_state_dict) diff --git a/monai/networks/nets/daf3d.py b/monai/networks/nets/daf3d.py index c9a18c746a..02e5bb022a 100644 --- a/monai/networks/nets/daf3d.py +++ b/monai/networks/nets/daf3d.py @@ -13,6 +13,7 @@ from collections import OrderedDict from collections.abc import Callable, Sequence +from functools import partial import torch import torch.nn as nn @@ -25,6 +26,7 @@ from monai.networks.blocks.convolutions import Convolution from monai.networks.blocks.feature_pyramid_network import ExtraFPNBlock, FeaturePyramidNetwork from monai.networks.layers.factories import Conv, Norm +from monai.networks.layers.utils import get_norm_layer from monai.networks.nets.resnet import ResNet, ResNetBottleneck __all__ = [ @@ -170,33 +172,37 @@ class Daf3dResNetBottleneck(ResNetBottleneck): spatial_dims: number of spatial dimensions of the input image. stride: stride to use for second conv layer. downsample: which downsample layer to use. + norm: which normalization layer to use. Defaults to group. 
""" expansion = 2 - def __init__(self, in_planes, planes, spatial_dims=3, stride=1, downsample=None): - norm_type: Callable = Norm[Norm.GROUP, spatial_dims] + def __init__( + self, in_planes, planes, spatial_dims=3, stride=1, downsample=None, norm=("group", {"num_groups": 32}) + ): conv_type: Callable = Conv[Conv.CONV, spatial_dims] + norm_layer = partial(get_norm_layer, name=norm, spatial_dims=spatial_dims) + # in case downsample uses batch norm, change to group norm if isinstance(downsample, nn.Sequential): downsample = nn.Sequential( conv_type(in_planes, planes * self.expansion, kernel_size=1, stride=stride, bias=False), - norm_type(num_groups=32, num_channels=planes * self.expansion), + norm_layer(channels=planes * self.expansion), ) super().__init__(in_planes, planes, spatial_dims, stride, downsample) # change norm from batch to group norm - self.bn1 = norm_type(num_groups=32, num_channels=planes) - self.bn2 = norm_type(num_groups=32, num_channels=planes) - self.bn3 = norm_type(num_groups=32, num_channels=planes * self.expansion) + self.bn1 = norm_layer(channels=planes) + self.bn2 = norm_layer(channels=planes) + self.bn3 = norm_layer(channels=planes * self.expansion) # adapt second convolution to work with groups self.conv2 = conv_type(planes, planes, kernel_size=3, padding=1, stride=stride, groups=32, bias=False) # adapt activation function - self.relu = nn.PReLU() # type: ignore + self.relu = nn.PReLU() class Daf3dResNetDilatedBottleneck(Daf3dResNetBottleneck): @@ -212,8 +218,10 @@ class Daf3dResNetDilatedBottleneck(Daf3dResNetBottleneck): downsample: which downsample layer to use. 
""" - def __init__(self, in_planes, planes, spatial_dims=3, stride=1, downsample=None): - super().__init__(in_planes, planes, spatial_dims, stride, downsample) + def __init__( + self, in_planes, planes, spatial_dims=3, stride=1, downsample=None, norm=("group", {"num_groups": 32}) + ): + super().__init__(in_planes, planes, spatial_dims, stride, downsample, norm) # add dilation in second convolution conv_type: Callable = Conv[Conv.CONV, spatial_dims] @@ -287,7 +295,7 @@ def __init__( n_input_channels, self.in_planes, kernel_size=7, stride=(1, 2, 2), padding=(3, 3, 3), bias=False ) self.bn1 = norm_type(32, 64) - self.relu = nn.PReLU() # type: ignore + self.relu = nn.PReLU() # adapt layers to our needs self.layer1 = self._make_layer(Daf3dResNetBottleneck, block_inplanes[0], layers[0], spatial_dims, shortcut_type) diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py new file mode 100644 index 0000000000..f57fe251d2 --- /dev/null +++ b/monai/networks/nets/diffusion_model_unet.py @@ -0,0 +1,2036 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ========================================================================= +# Adapted from https://github.com/huggingface/diffusers +# which has the following license: +# https://github.com/huggingface/diffusers/blob/main/LICENSE +# +# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= + +from __future__ import annotations + +import math +from collections.abc import Sequence + +import torch +from torch import nn + +from monai.networks.blocks import Convolution, CrossAttentionBlock, MLPBlock, SABlock, SpatialAttentionBlock, Upsample +from monai.networks.layers.factories import Pool +from monai.utils import ensure_tuple_rep, optional_import + +Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange") + +__all__ = ["DiffusionModelUNet"] + + +def zero_module(module: nn.Module) -> nn.Module: + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +class DiffusionUNetTransformerBlock(nn.Module): + """ + A Transformer block that allows for the input dimension to differ from the hidden dimension. + + Args: + num_channels: number of channels in the input and output. + num_attention_heads: number of heads to use for multi-head attention. + num_head_channels: number of channels in each attention head. + dropout: dropout probability to use. + cross_attention_dim: size of the context vector for cross attention. + upcast_attention: if True, upcast attention operations to full precision. 
+ use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + + """ + + def __init__( + self, + num_channels: int, + num_attention_heads: int, + num_head_channels: int, + dropout: float = 0.0, + cross_attention_dim: int | None = None, + upcast_attention: bool = False, + use_flash_attention: bool = False, + include_fc: bool = True, + use_combined_linear: bool = False, + ) -> None: + super().__init__() + self.attn1 = SABlock( + hidden_size=num_attention_heads * num_head_channels, + hidden_input_size=num_channels, + num_heads=num_attention_heads, + dim_head=num_head_channels, + dropout_rate=dropout, + attention_dtype=torch.float if upcast_attention else None, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + self.ff = MLPBlock(hidden_size=num_channels, mlp_dim=num_channels * 4, act="GEGLU", dropout_rate=dropout) + self.attn2 = CrossAttentionBlock( + hidden_size=num_attention_heads * num_head_channels, + num_heads=num_attention_heads, + hidden_input_size=num_channels, + context_input_size=cross_attention_dim, + dim_head=num_head_channels, + dropout_rate=dropout, + attention_dtype=torch.float if upcast_attention else None, + use_flash_attention=use_flash_attention, + ) + self.norm1 = nn.LayerNorm(num_channels) + self.norm2 = nn.LayerNorm(num_channels) + self.norm3 = nn.LayerNorm(num_channels) + + def forward(self, x: torch.Tensor, context: torch.Tensor | None = None) -> torch.Tensor: + # 1. Self-Attention + x = self.attn1(self.norm1(x)) + x + + # 2. Cross-Attention + x = self.attn2(self.norm2(x), context=context) + x + + # 3. 
Feed-forward + x = self.ff(self.norm3(x)) + x + return x + + +class SpatialTransformer(nn.Module): + """ + Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply + standard transformer action. Finally, reshape to image. + + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of channels in the input and output. + num_attention_heads: number of heads to use for multi-head attention. + num_head_channels: number of channels in each attention head. + num_layers: number of layers of Transformer blocks to use. + dropout: dropout probability to use. + norm_num_groups: number of groups for the normalization. + norm_eps: epsilon for the normalization. + cross_attention_dim: number of context dimensions to use. + upcast_attention: if True, upcast attention operations to full precision. + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). 
+ + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + num_attention_heads: int, + num_head_channels: int, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + cross_attention_dim: int | None = None, + upcast_attention: bool = False, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + self.spatial_dims = spatial_dims + self.in_channels = in_channels + inner_dim = num_attention_heads * num_head_channels + + self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=norm_eps, affine=True) + + self.proj_in = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=inner_dim, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + + self.transformer_blocks = nn.ModuleList( + [ + DiffusionUNetTransformerBlock( + num_channels=inner_dim, + num_attention_heads=num_attention_heads, + num_head_channels=num_head_channels, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + for _ in range(num_layers) + ] + ) + + self.proj_out = zero_module( + Convolution( + spatial_dims=spatial_dims, + in_channels=inner_dim, + out_channels=in_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + ) + + def forward(self, x: torch.Tensor, context: torch.Tensor | None = None) -> torch.Tensor: + # note: if no context is given, cross-attention defaults to self-attention + batch = channel = height = width = depth = -1 + if self.spatial_dims == 2: + batch, channel, height, width = x.shape + if self.spatial_dims == 3: + batch, channel, height, width, depth = x.shape + + residual = x + x = self.norm(x) + x = self.proj_in(x) + + inner_dim = x.shape[1] + + if self.spatial_dims == 2: + x = x.permute(0, 2, 
3, 1).reshape(batch, height * width, inner_dim) + if self.spatial_dims == 3: + x = x.permute(0, 2, 3, 4, 1).reshape(batch, height * width * depth, inner_dim) + + for block in self.transformer_blocks: + x = block(x, context=context) + + if self.spatial_dims == 2: + x = x.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous() + if self.spatial_dims == 3: + x = x.reshape(batch, height, width, depth, inner_dim).permute(0, 4, 1, 2, 3).contiguous() + + x = self.proj_out(x) + return x + residual + + +def get_timestep_embedding(timesteps: torch.Tensor, embedding_dim: int, max_period: int = 10000) -> torch.Tensor: + """ + Create sinusoidal timestep embeddings following the implementation in Ho et al. "Denoising Diffusion Probabilistic + Models" https://arxiv.org/abs/2006.11239. + + Args: + timesteps: a 1-D Tensor of N indices, one per batch element. + embedding_dim: the dimension of the output. + max_period: controls the minimum frequency of the embeddings. + """ + if timesteps.ndim != 1: + raise ValueError("Timesteps should be a 1d-array") + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device) + freqs = torch.exp(exponent / half_dim) + + args = timesteps[:, None].float() * freqs[None, :] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + embedding = torch.nn.functional.pad(embedding, (0, 1, 0, 0)) + + return embedding + + +class DiffusionUnetDownsample(nn.Module): + """ + Downsampling layer. + + Args: + spatial_dims: number of spatial dimensions. + num_channels: number of input channels. + use_conv: if True uses Convolution instead of Pool average to perform downsampling. In case that use_conv is + False, the number of output channels must be the same as the number of input channels. + out_channels: number of output channels. 
+ padding: controls the amount of implicit zero-paddings on both sides for padding number of points + for each dimension. + """ + + def __init__( + self, spatial_dims: int, num_channels: int, use_conv: bool, out_channels: int | None = None, padding: int = 1 + ) -> None: + super().__init__() + self.num_channels = num_channels + self.out_channels = out_channels or num_channels + self.use_conv = use_conv + if use_conv: + self.op = Convolution( + spatial_dims=spatial_dims, + in_channels=self.num_channels, + out_channels=self.out_channels, + strides=2, + kernel_size=3, + padding=padding, + conv_only=True, + ) + else: + if self.num_channels != self.out_channels: + raise ValueError("num_channels and out_channels must be equal when use_conv=False") + self.op = Pool[Pool.AVG, spatial_dims](kernel_size=2, stride=2) + + def forward(self, x: torch.Tensor, emb: torch.Tensor | None = None) -> torch.Tensor: + del emb + if x.shape[1] != self.num_channels: + raise ValueError( + f"Input number of channels ({x.shape[1]}) is not equal to expected number of channels " + f"({self.num_channels})" + ) + output: torch.Tensor = self.op(x) + return output + + +class WrappedUpsample(Upsample): + """ + Wraps MONAI upsample block to allow for calling with timestep embeddings. + """ + + def forward(self, x: torch.Tensor, emb: torch.Tensor | None = None) -> torch.Tensor: + del emb + upsampled: torch.Tensor = super().forward(x) + return upsampled + + +class DiffusionUNetResnetBlock(nn.Module): + """ + Residual block with timestep conditioning. + + Args: + spatial_dims: The number of spatial dimensions. + in_channels: number of input channels. + temb_channels: number of timestep embedding channels. + out_channels: number of output channels. + up: if True, performs upsampling. + down: if True, performs downsampling. + norm_num_groups: number of groups for the group normalization. + norm_eps: epsilon for the group normalization. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + temb_channels: int, + out_channels: int | None = None, + up: bool = False, + down: bool = False, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + ) -> None: + super().__init__() + self.spatial_dims = spatial_dims + self.channels = in_channels + self.emb_channels = temb_channels + self.out_channels = out_channels or in_channels + self.up = up + self.down = down + + self.norm1 = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=norm_eps, affine=True) + self.nonlinearity = nn.SiLU() + self.conv1 = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + + self.upsample = self.downsample = None + if self.up: + self.upsample = WrappedUpsample( + spatial_dims=spatial_dims, + mode="nontrainable", + in_channels=in_channels, + out_channels=in_channels, + interp_mode="nearest", + scale_factor=2.0, + align_corners=None, + ) + elif down: + self.downsample = DiffusionUnetDownsample(spatial_dims, in_channels, use_conv=False) + + self.time_emb_proj = nn.Linear(temb_channels, self.out_channels) + + self.norm2 = nn.GroupNorm(num_groups=norm_num_groups, num_channels=self.out_channels, eps=norm_eps, affine=True) + self.conv2 = zero_module( + Convolution( + spatial_dims=spatial_dims, + in_channels=self.out_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + self.skip_connection: nn.Module + if self.out_channels == in_channels: + self.skip_connection = nn.Identity() + else: + self.skip_connection = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + + def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor: + h = x + h = self.norm1(h) + h = self.nonlinearity(h) + + if self.upsample 
def _build_downsampler(
    spatial_dims: int,
    channels: int,
    temb_channels: int,
    norm_num_groups: int,
    norm_eps: float,
    resblock_updown: bool,
    downsample_padding: int,
) -> nn.Module:
    """
    Build the downsampling layer shared by all down blocks.

    Args:
        spatial_dims: number of spatial dimensions.
        channels: number of channels entering and leaving the downsampler.
        temb_channels: number of timestep embedding channels (used by the residual variant only).
        norm_num_groups: number of groups for the group normalization (residual variant only).
        norm_eps: epsilon for the group normalization (residual variant only).
        resblock_updown: if True build a residual block with ``down=True``, otherwise a convolutional
            ``DiffusionUnetDownsample``.
        downsample_padding: padding passed to ``DiffusionUnetDownsample``; ignored by the residual variant.
    """
    if resblock_updown:
        return DiffusionUNetResnetBlock(
            spatial_dims=spatial_dims,
            in_channels=channels,
            out_channels=channels,
            temb_channels=temb_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            down=True,
        )
    return DiffusionUnetDownsample(
        spatial_dims=spatial_dims,
        num_channels=channels,
        use_conv=True,
        out_channels=channels,
        padding=downsample_padding,
    )


def _build_upsampler(
    spatial_dims: int,
    channels: int,
    temb_channels: int,
    norm_num_groups: int,
    norm_eps: float,
    resblock_updown: bool,
) -> nn.Module:
    """
    Build the upsampling layer shared by all up blocks.

    Args:
        spatial_dims: number of spatial dimensions.
        channels: number of channels entering and leaving the upsampler.
        temb_channels: number of timestep embedding channels (used by the residual variant only).
        norm_num_groups: number of groups for the group normalization (residual variant only).
        norm_eps: epsilon for the group normalization (residual variant only).
        resblock_updown: if True build a residual block with ``up=True``, otherwise a nearest-neighbour
            ``WrappedUpsample`` (scale factor 2) followed by a 3-wide convolution.
    """
    if resblock_updown:
        return DiffusionUNetResnetBlock(
            spatial_dims=spatial_dims,
            in_channels=channels,
            out_channels=channels,
            temb_channels=temb_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            up=True,
        )
    # the convolution is created first and handed to the upsample wrapper as its post-processing step
    post_conv = Convolution(
        spatial_dims=spatial_dims,
        in_channels=channels,
        out_channels=channels,
        strides=1,
        kernel_size=3,
        padding=1,
        conv_only=True,
    )
    return WrappedUpsample(
        spatial_dims=spatial_dims,
        mode="nontrainable",
        in_channels=channels,
        out_channels=channels,
        interp_mode="nearest",
        scale_factor=2.0,
        post_conv=post_conv,
        align_corners=None,
    )


class DownBlock(nn.Module):
    """
    Unet's down block containing resnet and downsamplers blocks.

    Args:
        spatial_dims: The number of spatial dimensions.
        in_channels: number of input channels.
        out_channels: number of output channels.
        temb_channels: number of timestep embedding channels.
        num_res_blocks: number of residual blocks.
        norm_num_groups: number of groups for the group normalization.
        norm_eps: epsilon for the group normalization.
        add_downsample: if True add downsample block.
        resblock_updown: if True use residual blocks for downsampling.
        downsample_padding: padding used in the downsampling block (only when ``resblock_updown`` is False).
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        num_res_blocks: int = 1,
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        add_downsample: bool = True,
        resblock_updown: bool = False,
        downsample_padding: int = 1,
    ) -> None:
        super().__init__()
        self.resblock_updown = resblock_updown

        resnets = []

        for i in range(num_res_blocks):
            # only the first resnet sees the block's input width; later ones chain on out_channels
            in_channels = in_channels if i == 0 else out_channels
            resnets.append(
                DiffusionUNetResnetBlock(
                    spatial_dims=spatial_dims,
                    in_channels=in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                )
            )

        self.resnets = nn.ModuleList(resnets)

        # declared unconditionally so the attribute type matches the sibling down blocks
        self.downsampler: nn.Module | None = (
            _build_downsampler(
                spatial_dims=spatial_dims,
                channels=out_channels,
                temb_channels=temb_channels,
                norm_num_groups=norm_num_groups,
                norm_eps=norm_eps,
                resblock_updown=resblock_updown,
                downsample_padding=downsample_padding,
            )
            if add_downsample
            else None
        )

    def forward(
        self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """
        Args:
            hidden_states: input tensor.
            temb: timestep embedding, passed to every resnet and to the downsampler.
            context: unused; accepted so that all down blocks share the same call signature.

        Returns:
            the final hidden states, and the list of states collected after each resnet
            (plus one more after the downsampler when it exists).
        """
        del context
        output_states = []

        for resnet in self.resnets:
            hidden_states = resnet(hidden_states, temb)
            output_states.append(hidden_states)

        if self.downsampler is not None:
            hidden_states = self.downsampler(hidden_states, temb)
            output_states.append(hidden_states)

        return hidden_states, output_states


class AttnDownBlock(nn.Module):
    """
    Unet's down block containing resnet, downsamplers and self-attention blocks.

    Args:
        spatial_dims: The number of spatial dimensions.
        in_channels: number of input channels.
        out_channels: number of output channels.
        temb_channels: number of timestep embedding channels.
        num_res_blocks: number of residual blocks.
        norm_num_groups: number of groups for the group normalization.
        norm_eps: epsilon for the group normalization.
        add_downsample: if True add downsample block.
        resblock_updown: if True use residual blocks for downsampling.
        downsample_padding: padding used in the downsampling block (only when ``resblock_updown`` is False).
        num_head_channels: number of channels in each attention head.
        include_fc: whether to include the final linear layer. Default to True.
        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        num_res_blocks: int = 1,
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        add_downsample: bool = True,
        resblock_updown: bool = False,
        downsample_padding: int = 1,
        num_head_channels: int = 1,
        include_fc: bool = True,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
    ) -> None:
        super().__init__()
        self.resblock_updown = resblock_updown

        resnets = []
        attentions = []

        for i in range(num_res_blocks):
            # only the first resnet sees the block's input width; later ones chain on out_channels
            in_channels = in_channels if i == 0 else out_channels
            resnets.append(
                DiffusionUNetResnetBlock(
                    spatial_dims=spatial_dims,
                    in_channels=in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                )
            )
            attentions.append(
                SpatialAttentionBlock(
                    spatial_dims=spatial_dims,
                    num_channels=out_channels,
                    num_head_channels=num_head_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                    include_fc=include_fc,
                    use_combined_linear=use_combined_linear,
                    use_flash_attention=use_flash_attention,
                )
            )

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        self.downsampler: nn.Module | None = (
            _build_downsampler(
                spatial_dims=spatial_dims,
                channels=out_channels,
                temb_channels=temb_channels,
                norm_num_groups=norm_num_groups,
                norm_eps=norm_eps,
                resblock_updown=resblock_updown,
                downsample_padding=downsample_padding,
            )
            if add_downsample
            else None
        )

    def forward(
        self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """
        Args:
            hidden_states: input tensor.
            temb: timestep embedding, passed to every resnet and to the downsampler.
            context: unused; accepted so that all down blocks share the same call signature.

        Returns:
            the final hidden states, and the list of states collected after each resnet/attention pair
            (plus one more after the downsampler when it exists).
        """
        del context
        output_states = []

        for resnet, attn in zip(self.resnets, self.attentions):
            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states).contiguous()
            output_states.append(hidden_states)

        if self.downsampler is not None:
            hidden_states = self.downsampler(hidden_states, temb)
            output_states.append(hidden_states)

        return hidden_states, output_states


class CrossAttnDownBlock(nn.Module):
    """
    Unet's down block containing resnet, downsamplers and cross-attention blocks.

    Args:
        spatial_dims: number of spatial dimensions.
        in_channels: number of input channels.
        out_channels: number of output channels.
        temb_channels: number of timestep embedding channels.
        num_res_blocks: number of residual blocks.
        norm_num_groups: number of groups for the group normalization.
        norm_eps: epsilon for the group normalization.
        add_downsample: if True add downsample block.
        resblock_updown: if True use residual blocks for downsampling.
        downsample_padding: padding used in the downsampling block (only when ``resblock_updown`` is False).
        num_head_channels: number of channels in each attention head.
        transformer_num_layers: number of layers of Transformer blocks to use.
        cross_attention_dim: number of context dimensions to use.
        upcast_attention: if True, upcast attention operations to full precision.
        dropout_cattn: if different from zero, this will be the dropout value for the cross-attention layers.
        include_fc: whether to include the final linear layer. Default to True.
        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        num_res_blocks: int = 1,
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        add_downsample: bool = True,
        resblock_updown: bool = False,
        downsample_padding: int = 1,
        num_head_channels: int = 1,
        transformer_num_layers: int = 1,
        cross_attention_dim: int | None = None,
        upcast_attention: bool = False,
        dropout_cattn: float = 0.0,
        include_fc: bool = True,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
    ) -> None:
        super().__init__()
        self.resblock_updown = resblock_updown

        resnets = []
        attentions = []

        for i in range(num_res_blocks):
            # only the first resnet sees the block's input width; later ones chain on out_channels
            in_channels = in_channels if i == 0 else out_channels
            resnets.append(
                DiffusionUNetResnetBlock(
                    spatial_dims=spatial_dims,
                    in_channels=in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                )
            )

            attentions.append(
                SpatialTransformer(
                    spatial_dims=spatial_dims,
                    in_channels=out_channels,
                    num_attention_heads=out_channels // num_head_channels,
                    num_head_channels=num_head_channels,
                    num_layers=transformer_num_layers,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                    cross_attention_dim=cross_attention_dim,
                    upcast_attention=upcast_attention,
                    dropout=dropout_cattn,
                    include_fc=include_fc,
                    use_combined_linear=use_combined_linear,
                    use_flash_attention=use_flash_attention,
                )
            )

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        self.downsampler: nn.Module | None = (
            _build_downsampler(
                spatial_dims=spatial_dims,
                channels=out_channels,
                temb_channels=temb_channels,
                norm_num_groups=norm_num_groups,
                norm_eps=norm_eps,
                resblock_updown=resblock_updown,
                downsample_padding=downsample_padding,
            )
            if add_downsample
            else None
        )

    def forward(
        self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """
        Args:
            hidden_states: input tensor.
            temb: timestep embedding, passed to every resnet and to the downsampler.
            context: conditioning tensor handed to every transformer layer as ``context``.

        Returns:
            the final hidden states, and the list of states collected after each resnet/attention pair
            (plus one more after the downsampler when it exists).
        """
        output_states = []

        for resnet, attn in zip(self.resnets, self.attentions):
            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states, context=context).contiguous()
            output_states.append(hidden_states)

        if self.downsampler is not None:
            hidden_states = self.downsampler(hidden_states, temb)
            output_states.append(hidden_states)

        return hidden_states, output_states


class AttnMidBlock(nn.Module):
    """
    Unet's mid block containing resnet and self-attention blocks.

    Args:
        spatial_dims: The number of spatial dimensions.
        in_channels: number of input channels.
        temb_channels: number of timestep embedding channels.
        norm_num_groups: number of groups for the group normalization.
        norm_eps: epsilon for the group normalization.
        num_head_channels: number of channels in each attention head.
        include_fc: whether to include the final linear layer. Default to True.
        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        temb_channels: int,
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        num_head_channels: int = 1,
        include_fc: bool = True,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
    ) -> None:
        super().__init__()

        self.resnet_1 = DiffusionUNetResnetBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            out_channels=in_channels,
            temb_channels=temb_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
        )
        self.attention = SpatialAttentionBlock(
            spatial_dims=spatial_dims,
            num_channels=in_channels,
            num_head_channels=num_head_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
        )

        self.resnet_2 = DiffusionUNetResnetBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            out_channels=in_channels,
            temb_channels=temb_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
        )

    def forward(
        self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
    ) -> torch.Tensor:
        """
        Apply resnet -> self-attention -> resnet.

        Args:
            hidden_states: input tensor.
            temb: timestep embedding, passed to both resnets.
            context: unused; accepted so that both mid blocks share the same call signature.
        """
        del context
        hidden_states = self.resnet_1(hidden_states, temb)
        hidden_states = self.attention(hidden_states).contiguous()
        hidden_states = self.resnet_2(hidden_states, temb)

        return hidden_states


class CrossAttnMidBlock(nn.Module):
    """
    Unet's mid block containing resnet and cross-attention blocks.

    Args:
        spatial_dims: The number of spatial dimensions.
        in_channels: number of input channels.
        temb_channels: number of timestep embedding channels.
        norm_num_groups: number of groups for the group normalization.
        norm_eps: epsilon for the group normalization.
        num_head_channels: number of channels in each attention head.
        transformer_num_layers: number of layers of Transformer blocks to use.
        cross_attention_dim: number of context dimensions to use.
        upcast_attention: if True, upcast attention operations to full precision.
        dropout_cattn: if different from zero, this will be the dropout value for the cross-attention layers.
        include_fc: whether to include the final linear layer. Default to True.
        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        temb_channels: int,
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        num_head_channels: int = 1,
        transformer_num_layers: int = 1,
        cross_attention_dim: int | None = None,
        upcast_attention: bool = False,
        dropout_cattn: float = 0.0,
        include_fc: bool = True,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
    ) -> None:
        super().__init__()

        self.resnet_1 = DiffusionUNetResnetBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            out_channels=in_channels,
            temb_channels=temb_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
        )
        self.attention = SpatialTransformer(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            num_attention_heads=in_channels // num_head_channels,
            num_head_channels=num_head_channels,
            num_layers=transformer_num_layers,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            cross_attention_dim=cross_attention_dim,
            upcast_attention=upcast_attention,
            dropout=dropout_cattn,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
        )
        self.resnet_2 = DiffusionUNetResnetBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            out_channels=in_channels,
            temb_channels=temb_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
        )

    def forward(
        self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
    ) -> torch.Tensor:
        """
        Apply resnet -> cross-attention transformer -> resnet.

        Args:
            hidden_states: input tensor.
            temb: timestep embedding, passed to both resnets.
            context: conditioning tensor handed to the transformer as ``context``.
        """
        hidden_states = self.resnet_1(hidden_states, temb)
        hidden_states = self.attention(hidden_states, context=context)
        hidden_states = self.resnet_2(hidden_states, temb)

        return hidden_states


class UpBlock(nn.Module):
    """
    Unet's up block containing resnet and upsamplers blocks.

    Args:
        spatial_dims: The number of spatial dimensions.
        in_channels: number of input channels.
        prev_output_channel: number of channels from residual connection.
        out_channels: number of output channels.
        temb_channels: number of timestep embedding channels.
        num_res_blocks: number of residual blocks.
        norm_num_groups: number of groups for the group normalization.
        norm_eps: epsilon for the group normalization.
        add_upsample: if True add upsample block.
        resblock_updown: if True use residual blocks for upsampling.
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        num_res_blocks: int = 1,
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        add_upsample: bool = True,
        resblock_updown: bool = False,
    ) -> None:
        super().__init__()
        self.resblock_updown = resblock_updown
        resnets = []

        for i in range(num_res_blocks):
            # each resnet is sized for the concatenation done in forward():
            # `in_channels` is used for the last resnet, `prev_output_channel` for the first
            res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels
            resnet_in_channels = prev_output_channel if i == 0 else out_channels

            resnets.append(
                DiffusionUNetResnetBlock(
                    spatial_dims=spatial_dims,
                    in_channels=resnet_in_channels + res_skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                )
            )

        self.resnets = nn.ModuleList(resnets)

        self.upsampler: nn.Module | None = (
            _build_upsampler(
                spatial_dims=spatial_dims,
                channels=out_channels,
                temb_channels=temb_channels,
                norm_num_groups=norm_num_groups,
                norm_eps=norm_eps,
                resblock_updown=resblock_updown,
            )
            if add_upsample
            else None
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        res_hidden_states_list: list[torch.Tensor],
        temb: torch.Tensor,
        context: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states: input tensor.
            res_hidden_states_list: skip tensors; one is taken from the end of the list for each resnet.
                The caller's list object is left unmodified.
            temb: timestep embedding, passed to every resnet and to the upsampler.
            context: unused; accepted so that all up blocks share the same call signature.
        """
        del context
        for resnet in self.resnets:
            # take the last skip tensor and rebind to a shortened copy (no in-place pop on the caller's list)
            res_hidden_states = res_hidden_states_list[-1]
            res_hidden_states_list = res_hidden_states_list[:-1]
            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            hidden_states = resnet(hidden_states, temb)

        if self.upsampler is not None:
            hidden_states = self.upsampler(hidden_states, temb)

        return hidden_states


class AttnUpBlock(nn.Module):
    """
    Unet's up block containing resnet, upsamplers, and self-attention blocks.

    Args:
        spatial_dims: The number of spatial dimensions.
        in_channels: number of input channels.
        prev_output_channel: number of channels from residual connection.
        out_channels: number of output channels.
        temb_channels: number of timestep embedding channels.
        num_res_blocks: number of residual blocks.
        norm_num_groups: number of groups for the group normalization.
        norm_eps: epsilon for the group normalization.
        add_upsample: if True add upsample block.
        resblock_updown: if True use residual blocks for upsampling.
        num_head_channels: number of channels in each attention head.
        include_fc: whether to include the final linear layer. Default to True.
        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        num_res_blocks: int = 1,
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        add_upsample: bool = True,
        resblock_updown: bool = False,
        num_head_channels: int = 1,
        include_fc: bool = True,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
    ) -> None:
        super().__init__()
        self.resblock_updown = resblock_updown

        resnets = []
        attentions = []

        for i in range(num_res_blocks):
            # each resnet is sized for the concatenation done in forward():
            # `in_channels` is used for the last resnet, `prev_output_channel` for the first
            res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels
            resnet_in_channels = prev_output_channel if i == 0 else out_channels

            resnets.append(
                DiffusionUNetResnetBlock(
                    spatial_dims=spatial_dims,
                    in_channels=resnet_in_channels + res_skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                )
            )
            attentions.append(
                SpatialAttentionBlock(
                    spatial_dims=spatial_dims,
                    num_channels=out_channels,
                    num_head_channels=num_head_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                    include_fc=include_fc,
                    use_combined_linear=use_combined_linear,
                    use_flash_attention=use_flash_attention,
                )
            )

        self.resnets = nn.ModuleList(resnets)
        self.attentions = nn.ModuleList(attentions)

        self.upsampler: nn.Module | None = (
            _build_upsampler(
                spatial_dims=spatial_dims,
                channels=out_channels,
                temb_channels=temb_channels,
                norm_num_groups=norm_num_groups,
                norm_eps=norm_eps,
                resblock_updown=resblock_updown,
            )
            if add_upsample
            else None
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        res_hidden_states_list: list[torch.Tensor],
        temb: torch.Tensor,
        context: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states: input tensor.
            res_hidden_states_list: skip tensors; one is taken from the end of the list for each resnet.
                The caller's list object is left unmodified.
            temb: timestep embedding, passed to every resnet and to the upsampler.
            context: unused; accepted so that all up blocks share the same call signature.
        """
        del context
        for resnet, attn in zip(self.resnets, self.attentions):
            # take the last skip tensor and rebind to a shortened copy (no in-place pop on the caller's list)
            res_hidden_states = res_hidden_states_list[-1]
            res_hidden_states_list = res_hidden_states_list[:-1]
            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states).contiguous()

        if self.upsampler is not None:
            hidden_states = self.upsampler(hidden_states, temb)

        return hidden_states


class CrossAttnUpBlock(nn.Module):
    """
    Unet's up block containing resnet, upsamplers, and cross-attention blocks.

    Args:
        spatial_dims: The number of spatial dimensions.
        in_channels: number of input channels.
        prev_output_channel: number of channels from residual connection.
        out_channels: number of output channels.
        temb_channels: number of timestep embedding channels.
        num_res_blocks: number of residual blocks.
        norm_num_groups: number of groups for the group normalization.
        norm_eps: epsilon for the group normalization.
        add_upsample: if True add upsample block.
        resblock_updown: if True use residual blocks for upsampling.
        num_head_channels: number of channels in each attention head.
        transformer_num_layers: number of layers of Transformer blocks to use.
        cross_attention_dim: number of context dimensions to use.
        upcast_attention: if True, upcast attention operations to full precision.
        dropout_cattn: if different from zero, this will be the dropout value for the cross-attention layers.
        include_fc: whether to include the final linear layer. Default to True.
        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        num_res_blocks: int = 1,
        norm_num_groups: int = 32,
        norm_eps: float = 1e-6,
        add_upsample: bool = True,
        resblock_updown: bool = False,
        num_head_channels: int = 1,
        transformer_num_layers: int = 1,
        cross_attention_dim: int | None = None,
        upcast_attention: bool = False,
        dropout_cattn: float = 0.0,
        include_fc: bool = True,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
    ) -> None:
        super().__init__()
        self.resblock_updown = resblock_updown

        resnets = []
        attentions = []

        for i in range(num_res_blocks):
            # each resnet is sized for the concatenation done in forward():
            # `in_channels` is used for the last resnet, `prev_output_channel` for the first
            res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels
            resnet_in_channels = prev_output_channel if i == 0 else out_channels

            resnets.append(
                DiffusionUNetResnetBlock(
                    spatial_dims=spatial_dims,
                    in_channels=resnet_in_channels + res_skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                )
            )
            attentions.append(
                SpatialTransformer(
                    spatial_dims=spatial_dims,
                    in_channels=out_channels,
                    num_attention_heads=out_channels // num_head_channels,
                    num_head_channels=num_head_channels,
                    norm_num_groups=norm_num_groups,
                    norm_eps=norm_eps,
                    num_layers=transformer_num_layers,
                    cross_attention_dim=cross_attention_dim,
                    upcast_attention=upcast_attention,
                    dropout=dropout_cattn,
                    include_fc=include_fc,
                    use_combined_linear=use_combined_linear,
                    use_flash_attention=use_flash_attention,
                )
            )

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        self.upsampler: nn.Module | None = (
            _build_upsampler(
                spatial_dims=spatial_dims,
                channels=out_channels,
                temb_channels=temb_channels,
                norm_num_groups=norm_num_groups,
                norm_eps=norm_eps,
                resblock_updown=resblock_updown,
            )
            if add_upsample
            else None
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        res_hidden_states_list: list[torch.Tensor],
        temb: torch.Tensor,
        context: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states: input tensor.
            res_hidden_states_list: skip tensors; one is taken from the end of the list for each resnet.
                The caller's list object is left unmodified.
            temb: timestep embedding, passed to every resnet and to the upsampler.
            context: conditioning tensor handed to every transformer layer as ``context``.
        """
        for resnet, attn in zip(self.resnets, self.attentions):
            # take the last skip tensor and rebind to a shortened copy (no in-place pop on the caller's list)
            res_hidden_states = res_hidden_states_list[-1]
            res_hidden_states_list = res_hidden_states_list[:-1]
            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states, context=context)

        if self.upsampler is not None:
            hidden_states = self.upsampler(hidden_states, temb)

        return hidden_states


def get_down_block(
    spatial_dims: int,
    in_channels: int,
    out_channels: int,
    temb_channels: int,
    num_res_blocks: int,
    norm_num_groups: int,
    norm_eps: float,
    add_downsample: bool,
    resblock_updown: bool,
    with_attn: bool,
    with_cross_attn: bool,
    num_head_channels: int,
    transformer_num_layers: int,
    cross_attention_dim: int | None,
    upcast_attention: bool = False,
    dropout_cattn: float = 0.0,
    include_fc: bool = True,
    use_combined_linear: bool = False,
    use_flash_attention: bool = False,
) -> nn.Module:
    """
    Create the down block matching the requested attention flavour.

    ``with_attn`` is checked first, so it takes precedence over ``with_cross_attn``; when both are False a plain
    ``DownBlock`` is returned. ``transformer_num_layers``, ``cross_attention_dim``, ``upcast_attention`` and
    ``dropout_cattn`` are only forwarded to ``CrossAttnDownBlock``; the attention options ``num_head_channels``,
    ``include_fc``, ``use_combined_linear`` and ``use_flash_attention`` are not used by ``DownBlock``.
    ``downsample_padding`` is not exposed here, so every block keeps its own default.
    """
    if with_attn:
        return AttnDownBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            num_res_blocks=num_res_blocks,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            add_downsample=add_downsample,
            resblock_updown=resblock_updown,
            num_head_channels=num_head_channels,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
        )
    elif with_cross_attn:
        return CrossAttnDownBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            num_res_blocks=num_res_blocks,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            add_downsample=add_downsample,
            resblock_updown=resblock_updown,
            num_head_channels=num_head_channels,
            transformer_num_layers=transformer_num_layers,
            cross_attention_dim=cross_attention_dim,
            upcast_attention=upcast_attention,
            dropout_cattn=dropout_cattn,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
        )
    else:
        return DownBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            num_res_blocks=num_res_blocks,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            add_downsample=add_downsample,
            resblock_updown=resblock_updown,
        )


def get_mid_block(
    spatial_dims: int,
    in_channels: int,
    temb_channels: int,
    norm_num_groups: int,
    norm_eps: float,
    with_conditioning: bool,
    num_head_channels: int,
    transformer_num_layers: int,
    cross_attention_dim: int | None,
    upcast_attention: bool = False,
    dropout_cattn: float = 0.0,
    include_fc: bool = True,
    use_combined_linear: bool = False,
    use_flash_attention: bool = False,
) -> nn.Module:
    """
    Create the mid block: ``CrossAttnMidBlock`` when ``with_conditioning`` is True, otherwise ``AttnMidBlock``.

    ``transformer_num_layers``, ``cross_attention_dim``, ``upcast_attention`` and ``dropout_cattn`` are only
    forwarded to ``CrossAttnMidBlock``.
    """
    if with_conditioning:
        return CrossAttnMidBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            temb_channels=temb_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            num_head_channels=num_head_channels,
            transformer_num_layers=transformer_num_layers,
            cross_attention_dim=cross_attention_dim,
            upcast_attention=upcast_attention,
            dropout_cattn=dropout_cattn,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
        )
    else:
        return AttnMidBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            temb_channels=temb_channels,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            num_head_channels=num_head_channels,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
        )


def get_up_block(
    spatial_dims: int,
    in_channels: int,
    prev_output_channel: int,
    out_channels: int,
    temb_channels: int,
    num_res_blocks: int,
    norm_num_groups: int,
    norm_eps: float,
    add_upsample: bool,
    resblock_updown: bool,
    with_attn: bool,
    with_cross_attn: bool,
    num_head_channels: int,
    transformer_num_layers: int,
    cross_attention_dim: int | None,
    upcast_attention: bool = False,
    dropout_cattn: float = 0.0,
    include_fc: bool = True,
    use_combined_linear: bool = False,
    use_flash_attention: bool = False,
) -> nn.Module:
    """
    Create the up block matching the requested attention flavour.

    ``with_attn`` is checked first, so it takes precedence over ``with_cross_attn``; when both are False a plain
    ``UpBlock`` is returned. ``transformer_num_layers``, ``cross_attention_dim``, ``upcast_attention`` and
    ``dropout_cattn`` are only forwarded to ``CrossAttnUpBlock``; the attention options ``num_head_channels``,
    ``include_fc``, ``use_combined_linear`` and ``use_flash_attention`` are not used by ``UpBlock``.
    """
    if with_attn:
        return AttnUpBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            prev_output_channel=prev_output_channel,
            out_channels=out_channels,
            temb_channels=temb_channels,
            num_res_blocks=num_res_blocks,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            add_upsample=add_upsample,
            resblock_updown=resblock_updown,
            num_head_channels=num_head_channels,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
        )
    elif with_cross_attn:
        return CrossAttnUpBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            prev_output_channel=prev_output_channel,
            out_channels=out_channels,
            temb_channels=temb_channels,
            num_res_blocks=num_res_blocks,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            add_upsample=add_upsample,
            resblock_updown=resblock_updown,
            num_head_channels=num_head_channels,
            transformer_num_layers=transformer_num_layers,
            cross_attention_dim=cross_attention_dim,
            upcast_attention=upcast_attention,
            dropout_cattn=dropout_cattn,
            include_fc=include_fc,
            use_combined_linear=use_combined_linear,
            use_flash_attention=use_flash_attention,
        )
    else:
        return UpBlock(
            spatial_dims=spatial_dims,
            in_channels=in_channels,
            prev_output_channel=prev_output_channel,
            out_channels=out_channels,
            temb_channels=temb_channels,
            num_res_blocks=num_res_blocks,
            norm_num_groups=norm_num_groups,
            norm_eps=norm_eps,
            add_upsample=add_upsample,
            resblock_updown=resblock_updown,
        )
+ num_class_embeds: if specified (as an int), then this model will be class-conditional with `num_class_embeds` + classes. + upcast_attention: if True, upcast attention operations to full precision. + dropout_cattn: if different from zero, this will be the dropout value for the cross-attention layers. + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + num_res_blocks: Sequence[int] | int = (2, 2, 2, 2), + channels: Sequence[int] = (32, 64, 64, 64), + attention_levels: Sequence[bool] = (False, False, True, True), + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + resblock_updown: bool = False, + num_head_channels: int | Sequence[int] = 8, + with_conditioning: bool = False, + transformer_num_layers: int = 1, + cross_attention_dim: int | None = None, + num_class_embeds: int | None = None, + upcast_attention: bool = False, + dropout_cattn: float = 0.0, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + if with_conditioning is True and cross_attention_dim is None: + raise ValueError( + "DiffusionModelUNet expects dimension of the cross-attention conditioning (cross_attention_dim) " + "when using with_conditioning." + ) + if cross_attention_dim is not None and with_conditioning is False: + raise ValueError( + "DiffusionModelUNet expects with_conditioning=True when specifying the cross_attention_dim."
+ ) + if dropout_cattn > 1.0 or dropout_cattn < 0.0: + raise ValueError("Dropout cannot be negative or >1.0!") + + # All number of channels should be multiple of num_groups + if any((out_channel % norm_num_groups) != 0 for out_channel in channels): + raise ValueError("DiffusionModelUNet expects all num_channels being multiple of norm_num_groups") + + if len(channels) != len(attention_levels): + raise ValueError("DiffusionModelUNet expects num_channels being same size of attention_levels") + + if isinstance(num_head_channels, int): + num_head_channels = ensure_tuple_rep(num_head_channels, len(attention_levels)) + + if len(num_head_channels) != len(attention_levels): + raise ValueError( + "num_head_channels should have the same length as attention_levels. For the i levels without attention," + " i.e. `attention_level[i]=False`, the num_head_channels[i] will be ignored." + ) + + if isinstance(num_res_blocks, int): + num_res_blocks = ensure_tuple_rep(num_res_blocks, len(channels)) + + if len(num_res_blocks) != len(channels): + raise ValueError( + "`num_res_blocks` should be a single integer or a tuple of integers with the same length as " + "`num_channels`." 
+ ) + + self.in_channels = in_channels + self.block_out_channels = channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_levels = attention_levels + self.num_head_channels = num_head_channels + self.with_conditioning = with_conditioning + + # input + self.conv_in = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=channels[0], + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + + # time + time_embed_dim = channels[0] * 4 + self.time_embed = nn.Sequential( + nn.Linear(channels[0], time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim) + ) + + # class embedding + self.num_class_embeds = num_class_embeds + if num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + + # down + self.down_blocks = nn.ModuleList([]) + output_channel = channels[0] + for i in range(len(channels)): + input_channel = output_channel + output_channel = channels[i] + is_final_block = i == len(channels) - 1 + + down_block = get_down_block( + spatial_dims=spatial_dims, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + num_res_blocks=num_res_blocks[i], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_downsample=not is_final_block, + resblock_updown=resblock_updown, + with_attn=(attention_levels[i] and not with_conditioning), + with_cross_attn=(attention_levels[i] and with_conditioning), + num_head_channels=num_head_channels[i], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + dropout_cattn=dropout_cattn, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + self.down_blocks.append(down_block) + + # mid + self.middle_block = get_mid_block( + spatial_dims=spatial_dims, + in_channels=channels[-1], + temb_channels=time_embed_dim, + 
norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + with_conditioning=with_conditioning, + num_head_channels=num_head_channels[-1], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + dropout_cattn=dropout_cattn, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + # up + self.up_blocks = nn.ModuleList([]) + reversed_block_out_channels = list(reversed(channels)) + reversed_num_res_blocks = list(reversed(num_res_blocks)) + reversed_attention_levels = list(reversed(attention_levels)) + reversed_num_head_channels = list(reversed(num_head_channels)) + output_channel = reversed_block_out_channels[0] + for i in range(len(reversed_block_out_channels)): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(channels) - 1)] + + is_final_block = i == len(channels) - 1 + + up_block = get_up_block( + spatial_dims=spatial_dims, + in_channels=input_channel, + prev_output_channel=prev_output_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + num_res_blocks=reversed_num_res_blocks[i] + 1, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_upsample=not is_final_block, + resblock_updown=resblock_updown, + with_attn=(reversed_attention_levels[i] and not with_conditioning), + with_cross_attn=(reversed_attention_levels[i] and with_conditioning), + num_head_channels=reversed_num_head_channels[i], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + dropout_cattn=dropout_cattn, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + self.up_blocks.append(up_block) + + # out + self.out = nn.Sequential( + nn.GroupNorm(num_groups=norm_num_groups, num_channels=channels[0], 
eps=norm_eps, affine=True), + nn.SiLU(), + zero_module( + Convolution( + spatial_dims=spatial_dims, + in_channels=channels[0], + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ), + ) + + def forward( + self, + x: torch.Tensor, + timesteps: torch.Tensor, + context: torch.Tensor | None = None, + class_labels: torch.Tensor | None = None, + down_block_additional_residuals: tuple[torch.Tensor] | None = None, + mid_block_additional_residual: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Args: + x: input tensor (N, C, SpatialDims). + timesteps: timestep tensor (N,). + context: context tensor (N, 1, ContextDim). + class_labels: context tensor (N, ). + down_block_additional_residuals: additional residual tensors for down blocks (N, C, FeatureMapsDims). + mid_block_additional_residual: additional residual tensor for mid block (N, C, FeatureMapsDims). + """ + # 1. time + t_emb = get_timestep_embedding(timesteps, self.block_out_channels[0]) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=x.dtype) + emb = self.time_embed(t_emb) + + # 2. class + if self.num_class_embeds is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + class_emb = self.class_embedding(class_labels) + class_emb = class_emb.to(dtype=x.dtype) + emb = emb + class_emb + + # 3. initial convolution + h = self.conv_in(x) + + # 4. 
down + if context is not None and self.with_conditioning is False: + raise ValueError("model should have with_conditioning = True if context is provided") + down_block_res_samples: list[torch.Tensor] = [h] + for downsample_block in self.down_blocks: + h, res_samples = downsample_block(hidden_states=h, temb=emb, context=context) + for residual in res_samples: + down_block_res_samples.append(residual) + + # Additional residual connections for Controlnets + if down_block_additional_residuals is not None: + new_down_block_res_samples: list[torch.Tensor] = [] + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += [down_block_res_sample] + + down_block_res_samples = new_down_block_res_samples + + # 5. mid + h = self.middle_block(hidden_states=h, temb=emb, context=context) + + # Additional residual connections for Controlnets + if mid_block_additional_residual is not None: + h = h + mid_block_additional_residual + + # 6. up + for upsample_block in self.up_blocks: + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + h = upsample_block(hidden_states=h, res_hidden_states_list=res_samples, temb=emb, context=context) + + # 7. output block + output: torch.Tensor = self.out(h) + + return output + + def load_old_state_dict(self, old_state_dict: dict, verbose=False) -> None: + """ + Load a state dict from a DiffusionModelUNet trained with + [MONAI Generative](https://github.com/Project-MONAI/GenerativeModels). + + Args: + old_state_dict: state dict from the old DiffusionModelUNet model.
+ """ + + new_state_dict = self.state_dict() + # if all keys match, just load the state dict + if all(k in new_state_dict for k in old_state_dict): + print("All keys match, loading state dict.") + self.load_state_dict(old_state_dict) + return + + if verbose: + # print all new_state_dict keys that are not in old_state_dict + for k in new_state_dict: + if k not in old_state_dict: + print(f"key {k} not found in old state dict") + # and vice versa + print("----------------------------------------------") + for k in old_state_dict: + if k not in new_state_dict: + print(f"key {k} not found in new state dict") + + # copy over all matching keys + for k in new_state_dict: + if k in old_state_dict: + new_state_dict[k] = old_state_dict.pop(k) + + # fix the attention blocks + attention_blocks = [k.replace(".out_proj.weight", "") for k in new_state_dict if "out_proj.weight" in k] + for block in attention_blocks: + # projection + new_state_dict[f"{block}.out_proj.weight"] = old_state_dict.pop(f"{block}.to_out.0.weight") + new_state_dict[f"{block}.out_proj.bias"] = old_state_dict.pop(f"{block}.to_out.0.bias") + + # fix the upsample conv blocks which were renamed postconv + for k in new_state_dict: + if "postconv" in k: + old_name = k.replace("postconv", "conv") + new_state_dict[k] = old_state_dict.pop(old_name) + if verbose: + # print all remaining keys in old_state_dict + print("remaining keys in old_state_dict:", old_state_dict.keys()) + self.load_state_dict(new_state_dict) + + +class DiffusionModelEncoder(nn.Module): + """ + Classification Network based on the Encoder of the Diffusion Model, followed by fully connected layers. This network is based on + Wolleb et al. "Diffusion Models for Medical Anomaly Detection" (https://arxiv.org/abs/2203.04306). + + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of input channels. + out_channels: number of output channels. + num_res_blocks: number of residual blocks (see _ResnetBlock) per level. 
+ channels: tuple of block output channels. + attention_levels: list of levels to add attention. + norm_num_groups: number of groups for the normalization. + norm_eps: epsilon for the normalization. + resblock_updown: if True use residual blocks for downsampling. + num_head_channels: number of channels in each attention head. + with_conditioning: if True add spatial transformers to perform conditioning. + transformer_num_layers: number of layers of Transformer blocks to use. + cross_attention_dim: number of context dimensions to use. + num_class_embeds: if specified (as an int), then this model will be class-conditional with `num_class_embeds` classes. + upcast_attention: if True, upcast attention operations to full precision. + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + num_res_blocks: Sequence[int] | int = (2, 2, 2, 2), + channels: Sequence[int] = (32, 64, 64, 64), + attention_levels: Sequence[bool] = (False, False, True, True), + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + resblock_updown: bool = False, + num_head_channels: int | Sequence[int] = 8, + with_conditioning: bool = False, + transformer_num_layers: int = 1, + cross_attention_dim: int | None = None, + num_class_embeds: int | None = None, + upcast_attention: bool = False, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + if with_conditioning is True and cross_attention_dim is None: + raise ValueError( + "DiffusionModelEncoder expects dimension of the cross-attention conditioning (cross_attention_dim) " + "when using with_conditioning." + ) + if cross_attention_dim is not None and with_conditioning is False: + raise ValueError( + "DiffusionModelEncoder expects with_conditioning=True when specifying the cross_attention_dim." 
+ ) + + # All number of channels should be multiple of num_groups + if any((out_channel % norm_num_groups) != 0 for out_channel in channels): + raise ValueError("DiffusionModelEncoder expects all num_channels being multiple of norm_num_groups") + if len(channels) != len(attention_levels): + raise ValueError("DiffusionModelEncoder expects num_channels being same size of attention_levels") + + if isinstance(num_head_channels, int): + num_head_channels = ensure_tuple_rep(num_head_channels, len(attention_levels)) + + if isinstance(num_res_blocks, int): + num_res_blocks = ensure_tuple_rep(num_res_blocks, len(channels)) + + if len(num_head_channels) != len(attention_levels): + raise ValueError( + "num_head_channels should have the same length as attention_levels. For the i levels without attention," + " i.e. `attention_level[i]=False`, the num_head_channels[i] will be ignored." + ) + + self.in_channels = in_channels + self.block_out_channels = channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_levels = attention_levels + self.num_head_channels = num_head_channels + self.with_conditioning = with_conditioning + + # input + self.conv_in = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=channels[0], + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + + # time + time_embed_dim = channels[0] * 4 + self.time_embed = nn.Sequential( + nn.Linear(channels[0], time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim) + ) + + # class embedding + self.num_class_embeds = num_class_embeds + if num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + + # down + self.down_blocks = nn.ModuleList([]) + output_channel = channels[0] + for i in range(len(channels)): + input_channel = output_channel + output_channel = channels[i] + is_final_block = i == len(channels) # - 1 + + down_block = get_down_block( + spatial_dims=spatial_dims, + 
in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + num_res_blocks=num_res_blocks[i], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_downsample=not is_final_block, + resblock_updown=resblock_updown, + with_attn=(attention_levels[i] and not with_conditioning), + with_cross_attn=(attention_levels[i] and with_conditioning), + num_head_channels=num_head_channels[i], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + self.down_blocks.append(down_block) + + self.out = nn.Sequential(nn.Linear(4096, 512), nn.ReLU(), nn.Dropout(0.1), nn.Linear(512, self.out_channels)) + + def forward( + self, + x: torch.Tensor, + timesteps: torch.Tensor, + context: torch.Tensor | None = None, + class_labels: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Args: + x: input tensor (N, C, SpatialDims). + timesteps: timestep tensor (N,). + context: context tensor (N, 1, ContextDim). + class_labels: context tensor (N, ). + """ + # 1. time + t_emb = get_timestep_embedding(timesteps, self.block_out_channels[0]) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=x.dtype) + emb = self.time_embed(t_emb) + + # 2. class + if self.num_class_embeds is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + class_emb = self.class_embedding(class_labels) + class_emb = class_emb.to(dtype=x.dtype) + emb = emb + class_emb + + # 3. initial convolution + h = self.conv_in(x) + + # 4. 
down + if context is not None and self.with_conditioning is False: + raise ValueError("model should have with_conditioning = True if context is provided") + for downsample_block in self.down_blocks: + h, _ = downsample_block(hidden_states=h, temb=emb, context=context) + + h = h.reshape(h.shape[0], -1) + output: torch.Tensor = self.out(h) + + return output diff --git a/monai/networks/nets/flexible_unet.py b/monai/networks/nets/flexible_unet.py index ac2124b5f9..c27b0fc17b 100644 --- a/monai/networks/nets/flexible_unet.py +++ b/monai/networks/nets/flexible_unet.py @@ -24,6 +24,7 @@ from monai.networks.layers.utils import get_act_layer from monai.networks.nets import EfficientNetEncoder from monai.networks.nets.basic_unet import UpCat +from monai.networks.nets.resnet import ResNetEncoder from monai.utils import InterpolateMode, optional_import __all__ = ["FlexibleUNet", "FlexUNet", "FLEXUNET_BACKBONE", "FlexUNetEncoderRegister"] @@ -78,6 +79,7 @@ def register_class(self, name: type[Any] | str): FLEXUNET_BACKBONE = FlexUNetEncoderRegister() FLEXUNET_BACKBONE.register_class(EfficientNetEncoder) +FLEXUNET_BACKBONE.register_class(ResNetEncoder) class UNetDecoder(nn.Module): @@ -238,7 +240,7 @@ def __init__( ) -> None: """ A flexible implement of UNet, in which the backbone/encoder can be replaced with - any efficient network. Currently the input must have a 2 or 3 spatial dimension + any efficient or residual network. Currently the input must have a 2 or 3 spatial dimension and the spatial size of each dimension must be a multiple of 32 if is_pad parameter is False. Please notice each output of backbone must be 2x downsample in spatial dimension @@ -248,10 +250,11 @@ def __init__( Args: in_channels: number of input channels. out_channels: number of output channels. - backbone: name of backbones to initialize, only support efficientnet right now, - can be from [efficientnet-b0,..., efficientnet-b8, efficientnet-l2]. 
- pretrained: whether to initialize pretrained ImageNet weights, only available - for spatial_dims=2 and batch norm is used, default to False. + backbone: name of backbones to initialize, only support efficientnet and resnet right now, + can be from [efficientnet-b0, ..., efficientnet-b8, efficientnet-l2, resnet10, ..., resnet200]. + pretrained: whether to initialize pretrained weights. ImageNet weights are available for efficient networks + if spatial_dims=2 and batch norm is used. MedicalNet weights are available for residual networks + if spatial_dims=3 and in_channels=1. Default to False. decoder_channels: number of output channels for all feature maps in decoder. `len(decoder_channels)` should equal to `len(encoder_channels) - 1`,default to (256, 128, 64, 32, 16). diff --git a/monai/networks/nets/patchgan_discriminator.py b/monai/networks/nets/patchgan_discriminator.py new file mode 100644 index 0000000000..74da917694 --- /dev/null +++ b/monai/networks/nets/patchgan_discriminator.py @@ -0,0 +1,230 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from collections.abc import Sequence + +import torch +import torch.nn as nn + +from monai.networks.blocks import Convolution +from monai.networks.layers import Act +from monai.networks.utils import normal_init + + +class MultiScalePatchDiscriminator(nn.Sequential): + """ + Multi-scale Patch-GAN discriminator based on Pix2PixHD: + High-Resolution Image Synthesis and Semantic Manipulation with Conditional GANs (https://arxiv.org/abs/1711.11585) + + The Multi-scale discriminator is made up of several PatchGAN discriminators, that process the images + at different spatial scales. + + Args: + num_d: number of discriminators + num_layers_d: number of Convolution layers (Conv + activation + normalisation + [dropout]) in the first + discriminator. The i-th discriminator (counting from 1) uses `i * num_layers_d` layers. + spatial_dims: number of spatial dimensions (1D, 2D etc.) + channels: number of filters in the first convolutional layer (doubled for each subsequent layer) + in_channels: number of input channels + out_channels: number of output channels in each discriminator + kernel_size: kernel size of the convolution layers + activation: activation layer type + norm: normalisation type + bias: introduction of layer bias + dropout: probability of dropout applied, defaults to 0. + minimum_size_im: minimum spatial size of the input image. Introduced to make sure the architecture + requested isn't going to downsample the input image beyond value of 1. + last_conv_kernel_size: kernel size of the last convolutional layer.
+ """ + + def __init__( + self, + num_d: int, + num_layers_d: int, + spatial_dims: int, + channels: int, + in_channels: int, + out_channels: int = 1, + kernel_size: int = 4, + activation: str | tuple = (Act.LEAKYRELU, {"negative_slope": 0.2}), + norm: str | tuple = "BATCH", + bias: bool = False, + dropout: float | tuple = 0.0, + minimum_size_im: int = 256, + last_conv_kernel_size: int = 1, + ) -> None: + super().__init__() + self.num_d = num_d + self.num_layers_d = num_layers_d + self.num_channels = channels + self.padding = tuple([int((kernel_size - 1) / 2)] * spatial_dims) + for i_ in range(self.num_d): + num_layers_d_i = self.num_layers_d * (i_ + 1) + output_size = float(minimum_size_im) / (2**num_layers_d_i) + if output_size < 1: + raise AssertionError( + f"Your image size is too small to take in up to {i_} discriminators with num_layers = {num_layers_d_i}." + "Please reduce num_layers, reduce num_D or enter bigger images." + ) + subnet_d = PatchDiscriminator( + spatial_dims=spatial_dims, + channels=self.num_channels, + in_channels=in_channels, + out_channels=out_channels, + num_layers_d=num_layers_d_i, + kernel_size=kernel_size, + activation=activation, + norm=norm, + bias=bias, + padding=self.padding, + dropout=dropout, + last_conv_kernel_size=last_conv_kernel_size, + ) + + self.add_module("discriminator_%d" % i_, subnet_d) + + def forward(self, i: torch.Tensor) -> tuple[list[torch.Tensor], list[list[torch.Tensor]]]: + """ + Args: + i: Input tensor + + Returns: + list of outputs and another list of lists with the intermediate features + of each discriminator. 
+ """ + + out: list[torch.Tensor] = [] + intermediate_features: list[list[torch.Tensor]] = [] + for disc in self.children(): + out_d: list[torch.Tensor] = disc(i) + out.append(out_d[-1]) + intermediate_features.append(out_d[:-1]) + + return out, intermediate_features + + +class PatchDiscriminator(nn.Sequential): + """ + Patch-GAN discriminator based on Pix2PixHD: + High-Resolution Image Synthesis and Semantic Manipulation with Conditional GANs (https://arxiv.org/abs/1711.11585) + + + Args: + spatial_dims: number of spatial dimensions (1D, 2D etc.) + channels: number of filters in the first convolutional layer (doubled for each subsequent layer) + in_channels: number of input channels + out_channels: number of output channels + num_layers_d: number of Convolution layers (Conv + activation + normalisation + [dropout]) in the discriminator. + kernel_size: kernel size of the convolution layers + activation: activation type and arguments. Defaults to LeakyReLU. + norm: feature normalization type and arguments. Defaults to batch norm. + bias: whether to have a bias term in convolution blocks. Defaults to False. + padding: padding to be applied to the convolutional layers + dropout: proportion of dropout applied, defaults to 0. + last_conv_kernel_size: kernel size of the last convolutional layer. Defaults to `kernel_size` when None.
+ """ + + def __init__( + self, + spatial_dims: int, + channels: int, + in_channels: int, + out_channels: int = 1, + num_layers_d: int = 3, + kernel_size: int = 4, + activation: str | tuple = (Act.LEAKYRELU, {"negative_slope": 0.2}), + norm: str | tuple = "BATCH", + bias: bool = False, + padding: int | Sequence[int] = 1, + dropout: float | tuple = 0.0, + last_conv_kernel_size: int | None = None, + ) -> None: + super().__init__() + self.num_layers_d = num_layers_d + self.num_channels = channels + if last_conv_kernel_size is None: + last_conv_kernel_size = kernel_size + + self.add_module( + "initial_conv", + Convolution( + spatial_dims=spatial_dims, + kernel_size=kernel_size, + in_channels=in_channels, + out_channels=channels, + act=activation, + bias=True, + norm=None, + dropout=dropout, + padding=padding, + strides=2, + ), + ) + + input_channels = channels + output_channels = channels * 2 + + # Initial Layer + for l_ in range(self.num_layers_d): + if l_ == self.num_layers_d - 1: + stride = 1 + else: + stride = 2 + layer = Convolution( + spatial_dims=spatial_dims, + kernel_size=kernel_size, + in_channels=input_channels, + out_channels=output_channels, + act=activation, + bias=bias, + norm=norm, + dropout=dropout, + padding=padding, + strides=stride, + ) + self.add_module("%d" % l_, layer) + input_channels = output_channels + output_channels = output_channels * 2 + + # Final layer + self.add_module( + "final_conv", + Convolution( + spatial_dims=spatial_dims, + kernel_size=last_conv_kernel_size, + in_channels=input_channels, + out_channels=out_channels, + bias=True, + conv_only=True, + padding=int((last_conv_kernel_size - 1) / 2), + dropout=0.0, + strides=1, + ), + ) + + self.apply(normal_init) + + def forward(self, x: torch.Tensor) -> list[torch.Tensor]: + """ + Args: + x: input tensor + + Returns: + list of intermediate features, with the last element being the output. 
+ """ + out = [x] + for submodel in self.children(): + intermediate_output = submodel(out[-1]) + out.append(intermediate_output) + + return out[1:] diff --git a/monai/networks/nets/quicknat.py b/monai/networks/nets/quicknat.py index cbcccf24d7..7e0f9c6b38 100644 --- a/monai/networks/nets/quicknat.py +++ b/monai/networks/nets/quicknat.py @@ -42,7 +42,7 @@ class SkipConnectionWithIdx(SkipConnection): Inherits from SkipConnection but provides the indizes with each forward pass. """ - def forward(self, input, indices): + def forward(self, input, indices): # type: ignore[override] return super().forward(input), indices @@ -57,7 +57,7 @@ class SequentialWithIdx(nn.Sequential): def __init__(self, *args): super().__init__(*args) - def forward(self, input, indices): + def forward(self, input, indices): # type: ignore[override] for module in self: input, indices = module(input, indices) return input, indices @@ -165,9 +165,11 @@ def _get_layer(self, in_channels, out_channels, dilation): ) return nn.Sequential(conv.get_submodule("adn"), conv.get_submodule("conv")) - def forward(self, input, _): + def forward(self, input, _): # type: ignore[override] i = 0 result = input + result1 = input # this will not stay this value, needed here for pylint/mypy + for l in self.children(): # ignoring the max (un-)pool and droupout already added in the initial initialization step if isinstance(l, (nn.MaxPool2d, nn.MaxUnpool2d, nn.Dropout2d)): @@ -213,7 +215,7 @@ def __init__(self, in_channels: int, max_pool, se_layer, dropout, kernel_size, n super().__init__(in_channels, se_layer, dropout, kernel_size, num_filters) self.max_pool = max_pool - def forward(self, input, indices=None): + def forward(self, input, indices=None): # type: ignore[override] input, indices = self.max_pool(input) out_block, _ = super().forward(input, None) @@ -241,7 +243,7 @@ def __init__(self, in_channels: int, un_pool, se_layer, dropout, kernel_size, nu super().__init__(in_channels, se_layer, dropout, kernel_size, 
num_filters) self.un_pool = un_pool - def forward(self, input, indices): + def forward(self, input, indices): # type: ignore[override] out_block, _ = super().forward(input, None) out_block = self.un_pool(out_block, indices) return out_block, None @@ -268,7 +270,7 @@ def __init__(self, in_channels: int, se_layer, dropout, max_pool, un_pool, kerne self.max_pool = max_pool self.un_pool = un_pool - def forward(self, input, indices): + def forward(self, input, indices): # type: ignore[override] out_block, indices = self.max_pool(input) out_block, _ = super().forward(out_block, None) out_block = self.un_pool(out_block, indices) diff --git a/monai/networks/nets/resnet.py b/monai/networks/nets/resnet.py index 34a4b7057e..d62722478e 100644 --- a/monai/networks/nets/resnet.py +++ b/monai/networks/nets/resnet.py @@ -21,8 +21,9 @@ import torch import torch.nn as nn -from monai.networks.layers.factories import Conv, Norm, Pool -from monai.networks.layers.utils import get_pool_layer +from monai.networks.blocks.encoder import BaseEncoder +from monai.networks.layers.factories import Conv, Pool +from monai.networks.layers.utils import get_act_layer, get_norm_layer, get_pool_layer from monai.utils import ensure_tuple_rep from monai.utils.module import look_up_option, optional_import @@ -45,6 +46,17 @@ "resnet200", ] +resnet_params = { + # model_name: (block, layers, shortcut_type, bias_downsample, datasets23) + "resnet10": ("basic", [1, 1, 1, 1], "B", False, True), + "resnet18": ("basic", [2, 2, 2, 2], "A", True, True), + "resnet34": ("basic", [3, 4, 6, 3], "A", True, True), + "resnet50": ("bottleneck", [3, 4, 6, 3], "B", False, True), + "resnet101": ("bottleneck", [3, 4, 23, 3], "B", False, False), + "resnet152": ("bottleneck", [3, 8, 36, 3], "B", False, False), + "resnet200": ("bottleneck", [3, 24, 36, 3], "B", False, False), +} + logger = logging.getLogger(__name__) @@ -66,6 +78,8 @@ def __init__( spatial_dims: int = 3, stride: int = 1, downsample: nn.Module | partial | None = 
None, + act: str | tuple = ("relu", {"inplace": True}), + norm: str | tuple = "batch", ) -> None: """ Args: @@ -74,17 +88,18 @@ def __init__( spatial_dims: number of spatial dimensions of the input image. stride: stride to use for first conv layer. downsample: which downsample layer to use. + act: activation type and arguments. Defaults to relu. + norm: feature normalization type and arguments. Defaults to batch norm. """ super().__init__() conv_type: Callable = Conv[Conv.CONV, spatial_dims] - norm_type: Callable = Norm[Norm.BATCH, spatial_dims] self.conv1 = conv_type(in_planes, planes, kernel_size=3, padding=1, stride=stride, bias=False) - self.bn1 = norm_type(planes) - self.relu = nn.ReLU(inplace=True) + self.bn1 = get_norm_layer(name=norm, spatial_dims=spatial_dims, channels=planes) + self.act = get_act_layer(name=act) self.conv2 = conv_type(planes, planes, kernel_size=3, padding=1, bias=False) - self.bn2 = norm_type(planes) + self.bn2 = get_norm_layer(name=norm, spatial_dims=spatial_dims, channels=planes) self.downsample = downsample self.stride = stride @@ -93,7 +108,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out: torch.Tensor = self.conv1(x) out = self.bn1(out) - out = self.relu(out) + out = self.act(out) out = self.conv2(out) out = self.bn2(out) @@ -102,7 +117,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: residual = self.downsample(x) out += residual - out = self.relu(out) + out = self.act(out) return out @@ -117,6 +132,8 @@ def __init__( spatial_dims: int = 3, stride: int = 1, downsample: nn.Module | partial | None = None, + act: str | tuple = ("relu", {"inplace": True}), + norm: str | tuple = "batch", ) -> None: """ Args: @@ -125,20 +142,22 @@ def __init__( spatial_dims: number of spatial dimensions of the input image. stride: stride to use for second conv layer. downsample: which downsample layer to use. + act: activation type and arguments. Defaults to relu. + norm: feature normalization type and arguments. 
Defaults to batch norm. """ super().__init__() conv_type: Callable = Conv[Conv.CONV, spatial_dims] - norm_type: Callable = Norm[Norm.BATCH, spatial_dims] + norm_layer = partial(get_norm_layer, name=norm, spatial_dims=spatial_dims) self.conv1 = conv_type(in_planes, planes, kernel_size=1, bias=False) - self.bn1 = norm_type(planes) + self.bn1 = norm_layer(channels=planes) self.conv2 = conv_type(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn2 = norm_type(planes) + self.bn2 = norm_layer(channels=planes) self.conv3 = conv_type(planes, planes * self.expansion, kernel_size=1, bias=False) - self.bn3 = norm_type(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) + self.bn3 = norm_layer(channels=planes * self.expansion) + self.act = get_act_layer(name=act) self.downsample = downsample self.stride = stride @@ -147,11 +166,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out: torch.Tensor = self.conv1(x) out = self.bn1(out) - out = self.relu(out) + out = self.act(out) out = self.conv2(out) out = self.bn2(out) - out = self.relu(out) + out = self.act(out) out = self.conv3(out) out = self.bn3(out) @@ -160,7 +179,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: residual = self.downsample(x) out += residual - out = self.relu(out) + out = self.act(out) return out @@ -190,6 +209,8 @@ class ResNet(nn.Module): num_classes: number of output (classifications). feed_forward: whether to add the FC layer for the output, default to `True`. bias_downsample: whether to use bias term in the downsampling block when `shortcut_type` is 'B', default to `True`. + act: activation type and arguments. Defaults to relu. + norm: feature normalization type and arguments. Defaults to batch norm. 
""" @@ -208,6 +229,8 @@ def __init__( num_classes: int = 400, feed_forward: bool = True, bias_downsample: bool = True, # for backwards compatibility (also see PR #5477) + act: str | tuple = ("relu", {"inplace": True}), + norm: str | tuple = "batch", ) -> None: super().__init__() @@ -220,7 +243,6 @@ def __init__( raise ValueError("Unknown block '%s', use basic or bottleneck" % block) conv_type: type[nn.Conv1d | nn.Conv2d | nn.Conv3d] = Conv[Conv.CONV, spatial_dims] - norm_type: type[nn.BatchNorm1d | nn.BatchNorm2d | nn.BatchNorm3d] = Norm[Norm.BATCH, spatial_dims] pool_type: type[nn.MaxPool1d | nn.MaxPool2d | nn.MaxPool3d] = Pool[Pool.MAX, spatial_dims] avgp_type: type[nn.AdaptiveAvgPool1d | nn.AdaptiveAvgPool2d | nn.AdaptiveAvgPool3d] = Pool[ Pool.ADAPTIVEAVG, spatial_dims @@ -244,8 +266,10 @@ def __init__( padding=tuple(k // 2 for k in conv1_kernel_size), bias=False, ) - self.bn1 = norm_type(self.in_planes) - self.relu = nn.ReLU(inplace=True) + + norm_layer = get_norm_layer(name=norm, spatial_dims=spatial_dims, channels=self.in_planes) + self.bn1 = norm_layer + self.act = get_act_layer(name=act) self.maxpool = pool_type(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, block_inplanes[0], layers[0], spatial_dims, shortcut_type) self.layer2 = self._make_layer(block, block_inplanes[1], layers[1], spatial_dims, shortcut_type, stride=2) @@ -257,7 +281,7 @@ def __init__( for m in self.modules(): if isinstance(m, conv_type): nn.init.kaiming_normal_(torch.as_tensor(m.weight), mode="fan_out", nonlinearity="relu") - elif isinstance(m, norm_type): + elif isinstance(m, type(norm_layer)): nn.init.constant_(torch.as_tensor(m.weight), 1) nn.init.constant_(torch.as_tensor(m.bias), 0) elif isinstance(m, nn.Linear): @@ -277,9 +301,9 @@ def _make_layer( spatial_dims: int, shortcut_type: str, stride: int = 1, + norm: str | tuple = "batch", ) -> nn.Sequential: conv_type: Callable = Conv[Conv.CONV, spatial_dims] - norm_type: Callable = Norm[Norm.BATCH, 
spatial_dims] downsample: nn.Module | partial | None = None if stride != 1 or self.in_planes != planes * block.expansion: @@ -299,25 +323,30 @@ def _make_layer( stride=stride, bias=self.bias_downsample, ), - norm_type(planes * block.expansion), + get_norm_layer(name=norm, spatial_dims=spatial_dims, channels=planes * block.expansion), ) layers = [ block( - in_planes=self.in_planes, planes=planes, spatial_dims=spatial_dims, stride=stride, downsample=downsample + in_planes=self.in_planes, + planes=planes, + spatial_dims=spatial_dims, + stride=stride, + downsample=downsample, + norm=norm, ) ] self.in_planes = planes * block.expansion for _i in range(1, blocks): - layers.append(block(self.in_planes, planes, spatial_dims=spatial_dims)) + layers.append(block(self.in_planes, planes, spatial_dims=spatial_dims, norm=norm)) return nn.Sequential(*layers) def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.conv1(x) x = self.bn1(x) - x = self.relu(x) + x = self.act(x) if not self.no_max_pool: x = self.maxpool(x) @@ -335,6 +364,120 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x +class ResNetFeatures(ResNet): + + def __init__(self, model_name: str, pretrained: bool = True, spatial_dims: int = 3, in_channels: int = 1) -> None: + """Initialize resnet10 to resnet200 models as a backbone, the backbone can be used as an encoder for + segmentation and object detection models. + + Compared with the class `ResNet`, the only difference is the forward function. + + Args: + model_name: name of model to initialize, can be from [resnet10, ..., resnet200]. + pretrained: whether to initialize pretrained MedicalNet weights, + only available for spatial_dims=3 and in_channels=1. + spatial_dims: number of spatial dimensions of the input image. + in_channels: number of input channels for first convolutional layer. 
+ """ + if model_name not in resnet_params: + model_name_string = ", ".join(resnet_params.keys()) + raise ValueError(f"invalid model_name {model_name} found, must be one of {model_name_string} ") + + block, layers, shortcut_type, bias_downsample, datasets23 = resnet_params[model_name] + + super().__init__( + block=block, + layers=layers, + block_inplanes=get_inplanes(), + spatial_dims=spatial_dims, + n_input_channels=in_channels, + conv1_t_stride=2, + shortcut_type=shortcut_type, + feed_forward=False, + bias_downsample=bias_downsample, + ) + if pretrained: + if spatial_dims == 3 and in_channels == 1: + _load_state_dict(self, model_name, datasets23=datasets23) + else: + raise ValueError("Pretrained resnet models are only available for in_channels=1 and spatial_dims=3.") + + def forward(self, inputs: torch.Tensor): + """ + Args: + inputs: input should have spatially N dimensions + ``(Batch, in_channels, dim_0[, dim_1, ..., dim_N])``, N is defined by `dimensions`. + + Returns: + a list of torch Tensors. 
+ """ + x = self.conv1(inputs) + x = self.bn1(x) + x = self.act(x) + + features = [] + features.append(x) + + if not self.no_max_pool: + x = self.maxpool(x) + + x = self.layer1(x) + features.append(x) + + x = self.layer2(x) + features.append(x) + + x = self.layer3(x) + features.append(x) + + x = self.layer4(x) + features.append(x) + + return features + + +class ResNetEncoder(ResNetFeatures, BaseEncoder): + """Wrap the original resnet to an encoder for flexible-unet.""" + + backbone_names = ["resnet10", "resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "resnet200"] + + @classmethod + def get_encoder_parameters(cls) -> list[dict]: + """Get the initialization parameter for resnet backbones.""" + parameter_list = [] + for backbone_name in cls.backbone_names: + parameter_list.append( + {"model_name": backbone_name, "pretrained": True, "spatial_dims": 3, "in_channels": 1} + ) + return parameter_list + + @classmethod + def num_channels_per_output(cls) -> list[tuple[int, ...]]: + """Get number of resnet backbone output feature maps channel.""" + return [ + (64, 64, 128, 256, 512), + (64, 64, 128, 256, 512), + (64, 64, 128, 256, 512), + (64, 256, 512, 1024, 2048), + (64, 256, 512, 1024, 2048), + (64, 256, 512, 1024, 2048), + (64, 256, 512, 1024, 2048), + ] + + @classmethod + def num_outputs(cls) -> list[int]: + """Get number of resnet backbone output feature maps. + + Since every backbone contains the same 5 output feature maps, the number list should be `[5] * 7`. 
+ """ + return [5] * 7 + + @classmethod + def get_encoder_names(cls) -> list[str]: + """Get names of resnet backbones.""" + return cls.backbone_names + + def _resnet( arch: str, block: type[ResNetBlock | ResNetBottleneck], @@ -367,7 +510,7 @@ def _resnet( # Check model bias_downsample and shortcut_type bias_downsample, shortcut_type = get_medicalnet_pretrained_resnet_args(resnet_depth) if shortcut_type == kwargs.get("shortcut_type", "B") and ( - bool(bias_downsample) == kwargs.get("bias_downsample", False) if bias_downsample != -1 else True + bias_downsample == kwargs.get("bias_downsample", True) ): # Download the MedicalNet pretrained model model_state_dict = get_pretrained_resnet_medicalnet( @@ -375,8 +518,7 @@ def _resnet( ) else: raise NotImplementedError( - f"Please set shortcut_type to {shortcut_type} and bias_downsample to" - f"{bool(bias_downsample) if bias_downsample!=-1 else 'True or False'}" + f"Please set shortcut_type to {shortcut_type} and bias_downsample to {bias_downsample} " f"when using pretrained MedicalNet resnet{resnet_depth}" ) else: @@ -477,7 +619,7 @@ def resnet200(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> def get_pretrained_resnet_medicalnet(resnet_depth: int, device: str = "cpu", datasets23: bool = True): """ - Donwlad resnet pretrained weights from https://huggingface.co/TencentMedicalNet + Download resnet pretrained weights from https://huggingface.co/TencentMedicalNet Args: resnet_depth: depth of the pretrained model. Supported values are 10, 18, 34, 50, 101, 152 and 200 @@ -533,11 +675,24 @@ def get_pretrained_resnet_medicalnet(resnet_depth: int, device: str = "cpu", dat def get_medicalnet_pretrained_resnet_args(resnet_depth: int): """ Return correct shortcut_type and bias_downsample - for pretrained MedicalNet weights according to resnet depth + for pretrained MedicalNet weights according to resnet depth. 
""" # After testing # False: 10, 50, 101, 152, 200 # Any: 18, 34 - bias_downsample = -1 if resnet_depth in [18, 34] else 0 # 18, 10, 34 + bias_downsample = resnet_depth in (18, 34) shortcut_type = "A" if resnet_depth in [18, 34] else "B" return bias_downsample, shortcut_type + + +def _load_state_dict(model: nn.Module, model_name: str, datasets23: bool = True) -> None: + search_res = re.search(r"resnet(\d+)", model_name) + if search_res: + resnet_depth = int(search_res.group(1)) + datasets23 = model_name.endswith("_23datasets") + else: + raise ValueError("model_name argument should contain resnet depth. Example: resnet18 or resnet18_23datasets.") + + model_state_dict = get_pretrained_resnet_medicalnet(resnet_depth, device="cpu", datasets23=datasets23) + model_state_dict = {key.replace("module.", ""): value for key, value in model_state_dict.items()} + model.load_state_dict(model_state_dict) diff --git a/monai/networks/nets/segresnet_ds.py b/monai/networks/nets/segresnet_ds.py index 6430f5fdc9..1ac5a79ee3 100644 --- a/monai/networks/nets/segresnet_ds.py +++ b/monai/networks/nets/segresnet_ds.py @@ -11,6 +11,7 @@ from __future__ import annotations +import copy from collections.abc import Callable from typing import Union @@ -23,7 +24,7 @@ from monai.networks.layers.utils import get_act_layer, get_norm_layer from monai.utils import UpsampleMode, has_option -__all__ = ["SegResNetDS"] +__all__ = ["SegResNetDS", "SegResNetDS2"] def scales_for_resolution(resolution: tuple | list, n_stages: int | None = None): @@ -425,3 +426,128 @@ def _forward(self, x: torch.Tensor) -> Union[None, torch.Tensor, list[torch.Tens def forward(self, x: torch.Tensor) -> Union[None, torch.Tensor, list[torch.Tensor]]: return self._forward(x) + + +class SegResNetDS2(SegResNetDS): + """ + SegResNetDS2 adds an additional decorder branch to SegResNetDS and is the image encoder of VISTA3D + `_. + + Args: + spatial_dims: spatial dimension of the input data. Defaults to 3. 
+ init_filters: number of output channels for initial convolution layer. Defaults to 32. + in_channels: number of input channels for the network. Defaults to 1. + out_channels: number of output channels for the network. Defaults to 2. + act: activation type and arguments. Defaults to ``RELU``. + norm: feature normalization type and arguments. Defaults to ``BATCH``. + blocks_down: number of downsample blocks in each layer. Defaults to ``[1,2,2,4]``. + blocks_up: number of upsample blocks (optional). + dsdepth: number of levels for deep supervision. This will be the length of the list of outputs at each scale level. + At dsdepth==1, only a single output is returned. + preprocess: optional callable function to apply before the model's forward pass. + resolution: optional input image resolution. When provided, the network will first use non-isotropic kernels to bring + image spacing into an approximately isotropic space. + Otherwise, by default, the kernel size and downsampling are always isotropic. 
+ + """ + + def __init__( + self, + spatial_dims: int = 3, + init_filters: int = 32, + in_channels: int = 1, + out_channels: int = 2, + act: tuple | str = "relu", + norm: tuple | str = "batch", + blocks_down: tuple = (1, 2, 2, 4), + blocks_up: tuple | None = None, + dsdepth: int = 1, + preprocess: nn.Module | Callable | None = None, + upsample_mode: UpsampleMode | str = "deconv", + resolution: tuple | None = None, + ): + super().__init__( + spatial_dims=spatial_dims, + init_filters=init_filters, + in_channels=in_channels, + out_channels=out_channels, + act=act, + norm=norm, + blocks_down=blocks_down, + blocks_up=blocks_up, + dsdepth=dsdepth, + preprocess=preprocess, + upsample_mode=upsample_mode, + resolution=resolution, + ) + + self.up_layers_auto = nn.ModuleList([copy.deepcopy(layer) for layer in self.up_layers]) + + def forward( # type: ignore + self, x: torch.Tensor, with_point: bool = True, with_label: bool = True + ) -> tuple[Union[None, torch.Tensor, list[torch.Tensor]], Union[None, torch.Tensor, list[torch.Tensor]]]: + """ + Args: + x: input tensor. + with_point: if true, return the point branch output. + with_label: if true, return the label branch output. 
+ """ + if self.preprocess is not None: + x = self.preprocess(x) + + if not self.is_valid_shape(x): + raise ValueError(f"Input spatial dims {x.shape} must be divisible by {self.shape_factor()}") + + x_down = self.encoder(x) + + x_down.reverse() + x = x_down.pop(0) + + if len(x_down) == 0: + x_down = [torch.zeros(1, device=x.device, dtype=x.dtype)] + + outputs: list[torch.Tensor] = [] + outputs_auto: list[torch.Tensor] = [] + x_ = x.clone() + if with_point: + i = 0 + for level in self.up_layers: + x = level["upsample"](x) + x = x + x_down[i] + x = level["blocks"](x) + + if len(self.up_layers) - i <= self.dsdepth: + outputs.append(level["head"](x)) + i = i + 1 + + outputs.reverse() + x = x_ + if with_label: + i = 0 + for level in self.up_layers_auto: + x = level["upsample"](x) + x = x + x_down[i] + x = level["blocks"](x) + + if len(self.up_layers) - i <= self.dsdepth: + outputs_auto.append(level["head"](x)) + i = i + 1 + + outputs_auto.reverse() + + return outputs[0] if len(outputs) == 1 else outputs, outputs_auto[0] if len(outputs_auto) == 1 else outputs_auto + + def set_auto_grad(self, auto_freeze=False, point_freeze=False): + """ + Args: + auto_freeze: if true, freeze the image encoder and the auto-branch. + point_freeze: if true, freeze the image encoder and the point-branch. + """ + for param in self.encoder.parameters(): + param.requires_grad = (not auto_freeze) and (not point_freeze) + + for param in self.up_layers_auto.parameters(): + param.requires_grad = not auto_freeze + + for param in self.up_layers.parameters(): + param.requires_grad = not point_freeze diff --git a/monai/networks/nets/spade_autoencoderkl.py b/monai/networks/nets/spade_autoencoderkl.py new file mode 100644 index 0000000000..cc8909194a --- /dev/null +++ b/monai/networks/nets/spade_autoencoderkl.py @@ -0,0 +1,502 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from collections.abc import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from monai.networks.blocks import Convolution, SpatialAttentionBlock, Upsample +from monai.networks.blocks.spade_norm import SPADE +from monai.networks.nets.autoencoderkl import Encoder +from monai.utils import ensure_tuple_rep + +__all__ = ["SPADEAutoencoderKL"] + + +class SPADEResBlock(nn.Module): + """ + Residual block consisting of a cascade of 2 convolutions + activation + normalisation block, and a + residual connection between input and output. + Enables SPADE normalisation for semantic conditioning (Park et. al (2019): https://github.com/NVlabs/SPADE) + + Args: + spatial_dims: number of spatial dimensions (1D, 2D, 3D). + in_channels: input channels to the layer. + norm_num_groups: number of groups involved for the group normalisation layer. Ensure that your number of + channels is divisible by this number. + norm_eps: epsilon for the normalisation. + out_channels: number of output channels. 
+ label_nc: number of semantic channels for SPADE normalisation + spade_intermediate_channels: number of intermediate channels for SPADE block layer + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + norm_num_groups: int, + norm_eps: float, + out_channels: int, + label_nc: int, + spade_intermediate_channels: int, + ) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = in_channels if out_channels is None else out_channels + self.norm1 = SPADE( + label_nc=label_nc, + norm_nc=in_channels, + norm="GROUP", + norm_params={"num_groups": norm_num_groups, "affine": False, "eps": norm_eps}, + hidden_channels=spade_intermediate_channels, + kernel_size=3, + spatial_dims=spatial_dims, + ) + self.conv1 = Convolution( + spatial_dims=spatial_dims, + in_channels=self.in_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + self.norm2 = SPADE( + label_nc=label_nc, + norm_nc=out_channels, + norm="GROUP", + norm_params={"num_groups": norm_num_groups, "affine": False, "eps": norm_eps}, + hidden_channels=spade_intermediate_channels, + kernel_size=3, + spatial_dims=spatial_dims, + ) + self.conv2 = Convolution( + spatial_dims=spatial_dims, + in_channels=self.out_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + + self.nin_shortcut: nn.Module + if self.in_channels != self.out_channels: + self.nin_shortcut = Convolution( + spatial_dims=spatial_dims, + in_channels=self.in_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + else: + self.nin_shortcut = nn.Identity() + + def forward(self, x: torch.Tensor, seg: torch.Tensor) -> torch.Tensor: + h = x + h = self.norm1(h, seg) + h = F.silu(h) + h = self.conv1(h) + h = self.norm2(h, seg) + h = F.silu(h) + h = self.conv2(h) + + x = self.nin_shortcut(x) + + return x + h + + +class SPADEDecoder(nn.Module): + """ + 
Convolutional cascade upsampling from a spatial latent space into an image space. + Enables SPADE normalisation for semantic conditioning (Park et al. (2019): https://github.com/NVlabs/SPADE) + + Args: + spatial_dims: number of spatial dimensions (1D, 2D, 3D). + channels: sequence of block output channels. + in_channels: number of channels in the bottom layer (latent space) of the autoencoder. + out_channels: number of output channels. + num_res_blocks: number of residual blocks (see SPADEResBlock) per level. + norm_num_groups: number of groups for the GroupNorm layers, channels must be divisible by this number. + norm_eps: epsilon for the normalization. + attention_levels: indicates which levels of channels contain an attention block. + label_nc: number of semantic channels for SPADE normalisation. + with_nonlocal_attn: if True use non-local attention block. + spade_intermediate_channels: number of intermediate channels for SPADE block layer. + include_fc: whether to include the final linear layer. Defaults to True. + use_combined_linear: whether to use a single linear layer for qkv projection, defaults to False. + use_flash_attention: if True, use PyTorch's built-in flash attention for a memory-efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). 
+ """ + + def __init__( + self, + spatial_dims: int, + channels: Sequence[int], + in_channels: int, + out_channels: int, + num_res_blocks: Sequence[int], + norm_num_groups: int, + norm_eps: float, + attention_levels: Sequence[bool], + label_nc: int, + with_nonlocal_attn: bool = True, + spade_intermediate_channels: int = 128, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + self.spatial_dims = spatial_dims + self.channels = channels + self.in_channels = in_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.norm_num_groups = norm_num_groups + self.norm_eps = norm_eps + self.attention_levels = attention_levels + self.label_nc = label_nc + + reversed_block_out_channels = list(reversed(channels)) + + blocks: list[nn.Module] = [] + + # Initial convolution + blocks.append( + Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=reversed_block_out_channels[0], + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + + # Non-local attention block + if with_nonlocal_attn is True: + blocks.append( + SPADEResBlock( + spatial_dims=spatial_dims, + in_channels=reversed_block_out_channels[0], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=reversed_block_out_channels[0], + label_nc=label_nc, + spade_intermediate_channels=spade_intermediate_channels, + ) + ) + blocks.append( + SpatialAttentionBlock( + spatial_dims=spatial_dims, + num_channels=reversed_block_out_channels[0], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + ) + blocks.append( + SPADEResBlock( + spatial_dims=spatial_dims, + in_channels=reversed_block_out_channels[0], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=reversed_block_out_channels[0], + label_nc=label_nc, + 
spade_intermediate_channels=spade_intermediate_channels, + ) + ) + + reversed_attention_levels = list(reversed(attention_levels)) + reversed_num_res_blocks = list(reversed(num_res_blocks)) + block_out_ch = reversed_block_out_channels[0] + for i in range(len(reversed_block_out_channels)): + block_in_ch = block_out_ch + block_out_ch = reversed_block_out_channels[i] + is_final_block = i == len(channels) - 1 + + for _ in range(reversed_num_res_blocks[i]): + blocks.append( + SPADEResBlock( + spatial_dims=spatial_dims, + in_channels=block_in_ch, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + out_channels=block_out_ch, + label_nc=label_nc, + spade_intermediate_channels=spade_intermediate_channels, + ) + ) + block_in_ch = block_out_ch + + if reversed_attention_levels[i]: + blocks.append( + SpatialAttentionBlock( + spatial_dims=spatial_dims, + num_channels=block_in_ch, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + ) + + if not is_final_block: + post_conv = Convolution( + spatial_dims=spatial_dims, + in_channels=block_in_ch, + out_channels=block_in_ch, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + blocks.append( + Upsample( + spatial_dims=spatial_dims, + mode="nontrainable", + in_channels=block_in_ch, + out_channels=block_in_ch, + interp_mode="nearest", + scale_factor=2.0, + post_conv=post_conv, + align_corners=None, + ) + ) + + blocks.append(nn.GroupNorm(num_groups=norm_num_groups, num_channels=block_in_ch, eps=norm_eps, affine=True)) + blocks.append( + Convolution( + spatial_dims=spatial_dims, + in_channels=block_in_ch, + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + + self.blocks = nn.ModuleList(blocks) + + def forward(self, x: torch.Tensor, seg: torch.Tensor) -> torch.Tensor: + for block in self.blocks: + if isinstance(block, SPADEResBlock): + x = block(x, seg) + 
else: + x = block(x) + return x + + +class SPADEAutoencoderKL(nn.Module): + """ + Autoencoder model with KL-regularized latent space based on + Rombach et al. "High-Resolution Image Synthesis with Latent Diffusion Models" https://arxiv.org/abs/2112.10752 + and Pinaya et al. "Brain Imaging Generation with Latent Diffusion Models" https://arxiv.org/abs/2209.07162 + Enables SPADE normalisation for semantic conditioning (Park et. al (2019): https://github.com/NVlabs/SPADE) + + Args: + spatial_dims: number of spatial dimensions (1D, 2D, 3D). + label_nc: number of semantic channels for SPADE normalisation. + in_channels: number of input channels. + out_channels: number of output channels. + num_res_blocks: number of residual blocks (see ResBlock) per level. + channels: sequence of block output channels. + attention_levels: sequence of levels to add attention. + latent_channels: latent embedding dimension. + norm_num_groups: number of groups for the GroupNorm layers, channels must be divisible by this number. + norm_eps: epsilon for the normalization. + with_encoder_nonlocal_attn: if True use non-local attention block in the encoder. + with_decoder_nonlocal_attn: if True use non-local attention block in the decoder. + spade_intermediate_channels: number of intermediate channels for SPADE block layer. 
+ """ + + def __init__( + self, + spatial_dims: int, + label_nc: int, + in_channels: int = 1, + out_channels: int = 1, + num_res_blocks: Sequence[int] | int = (2, 2, 2, 2), + channels: Sequence[int] = (32, 64, 64, 64), + attention_levels: Sequence[bool] = (False, False, True, True), + latent_channels: int = 3, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + with_encoder_nonlocal_attn: bool = True, + with_decoder_nonlocal_attn: bool = True, + spade_intermediate_channels: int = 128, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + + # All number of channels should be multiple of num_groups + if any((out_channel % norm_num_groups) != 0 for out_channel in channels): + raise ValueError("SPADEAutoencoderKL expects all channels being multiple of norm_num_groups") + + if len(channels) != len(attention_levels): + raise ValueError("SPADEAutoencoderKL expects channels being same size of attention_levels") + + if isinstance(num_res_blocks, int): + num_res_blocks = ensure_tuple_rep(num_res_blocks, len(channels)) + + if len(num_res_blocks) != len(channels): + raise ValueError( + "`num_res_blocks` should be a single integer or a tuple of integers with the same length as " + "`channels`." 
+ ) + + self.encoder = Encoder( + spatial_dims=spatial_dims, + in_channels=in_channels, + channels=channels, + out_channels=latent_channels, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + attention_levels=attention_levels, + with_nonlocal_attn=with_encoder_nonlocal_attn, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + self.decoder = SPADEDecoder( + spatial_dims=spatial_dims, + channels=channels, + in_channels=latent_channels, + out_channels=out_channels, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + attention_levels=attention_levels, + label_nc=label_nc, + with_nonlocal_attn=with_decoder_nonlocal_attn, + spade_intermediate_channels=spade_intermediate_channels, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + self.quant_conv_mu = Convolution( + spatial_dims=spatial_dims, + in_channels=latent_channels, + out_channels=latent_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + self.quant_conv_log_sigma = Convolution( + spatial_dims=spatial_dims, + in_channels=latent_channels, + out_channels=latent_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + self.post_quant_conv = Convolution( + spatial_dims=spatial_dims, + in_channels=latent_channels, + out_channels=latent_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + self.latent_channels = latent_channels + + def encode(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """ + Forwards an image through the spatial encoder, obtaining the latent mean and sigma representations. 
+ + Args: + x: BxCx[SPATIAL DIMS] tensor + + """ + h = self.encoder(x) + z_mu = self.quant_conv_mu(h) + z_log_var = self.quant_conv_log_sigma(h) + z_log_var = torch.clamp(z_log_var, -30.0, 20.0) + z_sigma = torch.exp(z_log_var / 2) + + return z_mu, z_sigma + + def sampling(self, z_mu: torch.Tensor, z_sigma: torch.Tensor) -> torch.Tensor: + """ + From the mean and sigma representations resulting from encoding an image through the latent space, + obtains a noise sample resulting from sampling Gaussian noise, multiplying by the standard deviation (sigma) and + adding the mean. + + Args: + z_mu: Bx[Z_CHANNELS]x[LATENT SPACE SIZE] mean vector obtained by the encoder when you encode an image + z_sigma: Bx[Z_CHANNELS]x[LATENT SPACE SIZE] standard deviation vector obtained by the encoder when you encode an image + + Returns: + sample of shape Bx[Z_CHANNELS]x[LATENT SPACE SIZE] + """ + eps = torch.randn_like(z_sigma) + z_vae = z_mu + eps * z_sigma + return z_vae + + def reconstruct(self, x: torch.Tensor, seg: torch.Tensor) -> torch.Tensor: + """ + Encodes and decodes an input image. + + Args: + x: BxCx[SPATIAL DIMENSIONS] tensor. + seg: Bx[LABEL_NC]x[SPATIAL DIMENSIONS] tensor of segmentations for SPADE norm. + Returns: + reconstructed image, of the same shape as input + """ + z_mu, _ = self.encode(x) + reconstruction = self.decode(z_mu, seg) + return reconstruction + + def decode(self, z: torch.Tensor, seg: torch.Tensor) -> torch.Tensor: + """ + Based on a latent space sample, forwards it through the Decoder. + + Args: + z: Bx[Z_CHANNELS]x[LATENT SPACE SHAPE] + seg: Bx[LABEL_NC]x[SPATIAL DIMENSIONS] tensor of segmentations for SPADE norm. 
+ Returns: + decoded image tensor + """ + z = self.post_quant_conv(z) + dec: torch.Tensor = self.decoder(z, seg) + return dec + + def forward(self, x: torch.Tensor, seg: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + z_mu, z_sigma = self.encode(x) + z = self.sampling(z_mu, z_sigma) + reconstruction = self.decode(z, seg) + return reconstruction, z_mu, z_sigma + + def encode_stage_2_inputs(self, x: torch.Tensor) -> torch.Tensor: + z_mu, z_sigma = self.encode(x) + z = self.sampling(z_mu, z_sigma) + return z + + def decode_stage_2_outputs(self, z: torch.Tensor, seg: torch.Tensor) -> torch.Tensor: + image = self.decode(z, seg) + return image diff --git a/monai/networks/nets/spade_diffusion_model_unet.py b/monai/networks/nets/spade_diffusion_model_unet.py new file mode 100644 index 0000000000..a9609b1d39 --- /dev/null +++ b/monai/networks/nets/spade_diffusion_model_unet.py @@ -0,0 +1,971 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ========================================================================= +# Adapted from https://github.com/huggingface/diffusers +# which has the following license: +# https://github.com/huggingface/diffusers/blob/main/LICENSE +# +# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= + +from __future__ import annotations + +from collections.abc import Sequence + +import torch +from torch import nn + +from monai.networks.blocks import Convolution, SpatialAttentionBlock +from monai.networks.blocks.spade_norm import SPADE +from monai.networks.nets.diffusion_model_unet import ( + DiffusionUnetDownsample, + DiffusionUNetResnetBlock, + SpatialTransformer, + WrappedUpsample, + get_down_block, + get_mid_block, + get_timestep_embedding, + zero_module, +) +from monai.utils import ensure_tuple_rep + +__all__ = ["SPADEDiffusionModelUNet"] + + +class SPADEDiffResBlock(nn.Module): + """ + Residual block with timestep conditioning and SPADE norm. + Enables SPADE normalisation for semantic conditioning (Park et. al (2019): https://github.com/NVlabs/SPADE) + + Args: + spatial_dims: The number of spatial dimensions. + in_channels: number of input channels. + temb_channels: number of timestep embedding channels. + label_nc: number of semantic channels for SPADE normalisation. + out_channels: number of output channels. + up: if True, performs upsampling. + down: if True, performs downsampling. + norm_num_groups: number of groups for the group normalization. + norm_eps: epsilon for the group normalization. 
+ spade_intermediate_channels: number of intermediate channels for SPADE block layer + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + temb_channels: int, + label_nc: int, + out_channels: int | None = None, + up: bool = False, + down: bool = False, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + spade_intermediate_channels: int = 128, + ) -> None: + super().__init__() + self.spatial_dims = spatial_dims + self.channels = in_channels + self.emb_channels = temb_channels + self.out_channels = out_channels or in_channels + self.up = up + self.down = down + + self.norm1 = SPADE( + label_nc=label_nc, + norm_nc=in_channels, + norm="GROUP", + norm_params={"num_groups": norm_num_groups, "eps": norm_eps, "affine": True}, + hidden_channels=spade_intermediate_channels, + kernel_size=3, + spatial_dims=spatial_dims, + ) + + self.nonlinearity = nn.SiLU() + self.conv1 = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + + self.upsample = self.downsample = None + if self.up: + self.upsample = WrappedUpsample( + spatial_dims=spatial_dims, + mode="nontrainable", + in_channels=in_channels, + out_channels=in_channels, + interp_mode="nearest", + scale_factor=2.0, + align_corners=None, + ) + elif down: + self.downsample = DiffusionUnetDownsample(spatial_dims, in_channels, use_conv=False) + + self.time_emb_proj = nn.Linear(temb_channels, self.out_channels) + + self.norm2 = SPADE( + label_nc=label_nc, + norm_nc=self.out_channels, + norm="GROUP", + norm_params={"num_groups": norm_num_groups, "eps": norm_eps, "affine": True}, + hidden_channels=spade_intermediate_channels, + kernel_size=3, + spatial_dims=spatial_dims, + ) + self.conv2 = zero_module( + Convolution( + spatial_dims=spatial_dims, + in_channels=self.out_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + 
self.skip_connection: nn.Module + + if self.out_channels == in_channels: + self.skip_connection = nn.Identity() + else: + self.skip_connection = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=self.out_channels, + strides=1, + kernel_size=1, + padding=0, + conv_only=True, + ) + + def forward(self, x: torch.Tensor, emb: torch.Tensor, seg: torch.Tensor) -> torch.Tensor: + h = x + h = self.norm1(h, seg) + h = self.nonlinearity(h) + + if self.upsample is not None: + x = self.upsample(x) + h = self.upsample(h) + elif self.downsample is not None: + x = self.downsample(x) + h = self.downsample(h) + + h = self.conv1(h) + + if self.spatial_dims == 2: + temb = self.time_emb_proj(self.nonlinearity(emb))[:, :, None, None] + else: + temb = self.time_emb_proj(self.nonlinearity(emb))[:, :, None, None, None] + h = h + temb + + h = self.norm2(h, seg) + h = self.nonlinearity(h) + h = self.conv2(h) + output: torch.Tensor = self.skip_connection(x) + h + return output + + +class SPADEUpBlock(nn.Module): + """ + Unet's up block containing resnet and upsamplers blocks. + Enables SPADE normalisation for semantic conditioning (Park et. al (2019): https://github.com/NVlabs/SPADE) + + Args: + spatial_dims: The number of spatial dimensions. + in_channels: number of input channels. + prev_output_channel: number of channels from residual connection. + out_channels: number of output channels. + temb_channels: number of timestep embedding channels. + label_nc: number of semantic channels for SPADE normalisation. + num_res_blocks: number of residual blocks. + norm_num_groups: number of groups for the group normalization. + norm_eps: epsilon for the group normalization. + add_upsample: if True add downsample block. + resblock_updown: if True use residual blocks for upsampling. + spade_intermediate_channels: number of intermediate channels for SPADE block layer. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + label_nc: int, + num_res_blocks: int = 1, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + add_upsample: bool = True, + resblock_updown: bool = False, + spade_intermediate_channels: int = 128, + ) -> None: + super().__init__() + self.resblock_updown = resblock_updown + resnets = [] + + for i in range(num_res_blocks): + res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + SPADEDiffResBlock( + spatial_dims=spatial_dims, + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + label_nc=label_nc, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + spade_intermediate_channels=spade_intermediate_channels, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + self.upsampler: nn.Module | None + if add_upsample: + if resblock_updown: + self.upsampler = DiffusionUNetResnetBlock( + spatial_dims=spatial_dims, + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + up=True, + ) + else: + post_conv = Convolution( + spatial_dims=spatial_dims, + in_channels=out_channels, + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + self.upsampler = WrappedUpsample( + spatial_dims=spatial_dims, + mode="nontrainable", + in_channels=out_channels, + out_channels=out_channels, + interp_mode="nearest", + scale_factor=2.0, + post_conv=post_conv, + align_corners=None, + ) + else: + self.upsampler = None + + def forward( + self, + hidden_states: torch.Tensor, + res_hidden_states_list: list[torch.Tensor], + temb: torch.Tensor, + seg: torch.Tensor, + context: torch.Tensor | None = None, + ) -> torch.Tensor: + del context + for resnet in 
self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_list[-1] + res_hidden_states_list = res_hidden_states_list[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + hidden_states = resnet(hidden_states, temb, seg) + + if self.upsampler is not None: + hidden_states = self.upsampler(hidden_states, temb) + + return hidden_states + + +class SPADEAttnUpBlock(nn.Module): + """ + Unet's up block containing resnet, upsamplers, and self-attention blocks. + Enables SPADE normalisation for semantic conditioning (Park et. al (2019): https://github.com/NVlabs/SPADE) + + Args: + spatial_dims: The number of spatial dimensions. + in_channels: number of input channels. + prev_output_channel: number of channels from residual connection. + out_channels: number of output channels. + temb_channels: number of timestep embedding channels. + label_nc: number of semantic channels for SPADE normalisation + num_res_blocks: number of residual blocks. + norm_num_groups: number of groups for the group normalization. + norm_eps: epsilon for the group normalization. + add_upsample: if True add downsample block. + resblock_updown: if True use residual blocks for upsampling. + num_head_channels: number of channels in each attention head. + spade_intermediate_channels: number of intermediate channels for SPADE block layer + include_fc: whether to include the final linear layer. Default to True. + use_combined_linear: whether to use a single linear layer for qkv projection, default to False. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + label_nc: int, + num_res_blocks: int = 1, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + add_upsample: bool = True, + resblock_updown: bool = False, + num_head_channels: int = 1, + spade_intermediate_channels: int = 128, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + self.resblock_updown = resblock_updown + resnets = [] + attentions = [] + + for i in range(num_res_blocks): + res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + SPADEDiffResBlock( + spatial_dims=spatial_dims, + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + label_nc=label_nc, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + spade_intermediate_channels=spade_intermediate_channels, + ) + ) + attentions.append( + SpatialAttentionBlock( + spatial_dims=spatial_dims, + num_channels=out_channels, + num_head_channels=num_head_channels, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.attentions = nn.ModuleList(attentions) + + self.upsampler: nn.Module | None + if add_upsample: + if resblock_updown: + self.upsampler = DiffusionUNetResnetBlock( + spatial_dims=spatial_dims, + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + up=True, + ) + else: + post_conv = Convolution( + spatial_dims=spatial_dims, + in_channels=out_channels, + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + 
conv_only=True, + ) + self.upsampler = WrappedUpsample( + spatial_dims=spatial_dims, + mode="nontrainable", + in_channels=out_channels, + out_channels=out_channels, + interp_mode="nearest", + scale_factor=2.0, + post_conv=post_conv, + align_corners=None, + ) + else: + self.upsampler = None + + def forward( + self, + hidden_states: torch.Tensor, + res_hidden_states_list: list[torch.Tensor], + temb: torch.Tensor, + seg: torch.Tensor, + context: torch.Tensor | None = None, + ) -> torch.Tensor: + del context + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_list[-1] + res_hidden_states_list = res_hidden_states_list[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + hidden_states = resnet(hidden_states, temb, seg) + hidden_states = attn(hidden_states).contiguous() + + if self.upsampler is not None: + hidden_states = self.upsampler(hidden_states, temb) + + return hidden_states + + +class SPADECrossAttnUpBlock(nn.Module): + """ + Unet's up block containing resnet, upsamplers, and self-attention blocks. + Enables SPADE normalisation for semantic conditioning (Park et. al (2019): https://github.com/NVlabs/SPADE) + + Args: + spatial_dims: The number of spatial dimensions. + in_channels: number of input channels. + prev_output_channel: number of channels from residual connection. + out_channels: number of output channels. + temb_channels: number of timestep embedding channels. + label_nc: number of semantic channels for SPADE normalisation. + num_res_blocks: number of residual blocks. + norm_num_groups: number of groups for the group normalization. + norm_eps: epsilon for the group normalization. + add_upsample: if True add downsample block. + resblock_updown: if True use residual blocks for upsampling. + num_head_channels: number of channels in each attention head. + transformer_num_layers: number of layers of Transformer blocks to use. 
+ cross_attention_dim: number of context dimensions to use. + upcast_attention: if True, upcast attention operations to full precision. + spade_intermediate_channels: number of intermediate channels for SPADE block layer. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism. + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + label_nc: int, + num_res_blocks: int = 1, + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + add_upsample: bool = True, + resblock_updown: bool = False, + num_head_channels: int = 1, + transformer_num_layers: int = 1, + cross_attention_dim: int | None = None, + upcast_attention: bool = False, + spade_intermediate_channels: int = 128, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + self.resblock_updown = resblock_updown + resnets = [] + attentions = [] + + for i in range(num_res_blocks): + res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + SPADEDiffResBlock( + spatial_dims=spatial_dims, + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + label_nc=label_nc, + spade_intermediate_channels=spade_intermediate_channels, + ) + ) + attentions.append( + SpatialTransformer( + spatial_dims=spatial_dims, + in_channels=out_channels, + num_attention_heads=out_channels // num_head_channels, + num_head_channels=num_head_channels, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + 
upcast_attention=upcast_attention, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + self.upsampler: nn.Module | None + if add_upsample: + if resblock_updown: + self.upsampler = DiffusionUNetResnetBlock( + spatial_dims=spatial_dims, + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + up=True, + ) + else: + post_conv = Convolution( + spatial_dims=spatial_dims, + in_channels=out_channels, + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + self.upsampler = WrappedUpsample( + spatial_dims=spatial_dims, + mode="nontrainable", + in_channels=out_channels, + out_channels=out_channels, + interp_mode="nearest", + scale_factor=2.0, + post_conv=post_conv, + align_corners=None, + ) + else: + self.upsampler = None + + def forward( + self, + hidden_states: torch.Tensor, + res_hidden_states_list: list[torch.Tensor], + temb: torch.Tensor, + seg: torch.Tensor | None = None, + context: torch.Tensor | None = None, + ) -> torch.Tensor: + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_list[-1] + res_hidden_states_list = res_hidden_states_list[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + hidden_states = resnet(hidden_states, temb, seg) + hidden_states = attn(hidden_states, context=context).contiguous() + + if self.upsampler is not None: + hidden_states = self.upsampler(hidden_states, temb) + + return hidden_states + + +def get_spade_up_block( + spatial_dims: int, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + num_res_blocks: int, + norm_num_groups: int, + norm_eps: float, + add_upsample: bool, + resblock_updown: bool, + with_attn: bool, + 
with_cross_attn: bool, + num_head_channels: int, + transformer_num_layers: int, + label_nc: int, + cross_attention_dim: int | None, + upcast_attention: bool = False, + spade_intermediate_channels: int = 128, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, +) -> nn.Module: + if with_attn: + return SPADEAttnUpBlock( + spatial_dims=spatial_dims, + in_channels=in_channels, + prev_output_channel=prev_output_channel, + out_channels=out_channels, + temb_channels=temb_channels, + label_nc=label_nc, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_upsample=add_upsample, + resblock_updown=resblock_updown, + num_head_channels=num_head_channels, + spade_intermediate_channels=spade_intermediate_channels, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + elif with_cross_attn: + return SPADECrossAttnUpBlock( + spatial_dims=spatial_dims, + in_channels=in_channels, + prev_output_channel=prev_output_channel, + out_channels=out_channels, + temb_channels=temb_channels, + label_nc=label_nc, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_upsample=add_upsample, + resblock_updown=resblock_updown, + num_head_channels=num_head_channels, + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + spade_intermediate_channels=spade_intermediate_channels, + use_flash_attention=use_flash_attention, + ) + else: + return SPADEUpBlock( + spatial_dims=spatial_dims, + in_channels=in_channels, + prev_output_channel=prev_output_channel, + out_channels=out_channels, + temb_channels=temb_channels, + label_nc=label_nc, + num_res_blocks=num_res_blocks, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_upsample=add_upsample, + resblock_updown=resblock_updown, + 
spade_intermediate_channels=spade_intermediate_channels, + ) + + +class SPADEDiffusionModelUNet(nn.Module): + """ + UNet network with timestep embedding and attention mechanisms for conditioning, with added SPADE normalization for + semantic conditioning (Park et.al (2019): https://github.com/NVlabs/SPADE). An example tutorial can be found at + https://github.com/Project-MONAI/GenerativeModels/tree/main/tutorials/generative/2d_spade_ldm + + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of input channels. + out_channels: number of output channels. + label_nc: number of semantic channels for SPADE normalisation. + num_res_blocks: number of residual blocks (see ResnetBlock) per level. + channels: tuple of block output channels. + attention_levels: list of levels to add attention. + norm_num_groups: number of groups for the normalization. + norm_eps: epsilon for the normalization. + resblock_updown: if True use residual blocks for up/downsampling. + num_head_channels: number of channels in each attention head. + with_conditioning: if True add spatial transformers to perform conditioning. + transformer_num_layers: number of layers of Transformer blocks to use. + cross_attention_dim: number of context dimensions to use. + num_class_embeds: if specified (as an int), then this model will be class-conditional with `num_class_embeds` + classes. + upcast_attention: if True, upcast attention operations to full precision. + spade_intermediate_channels: number of intermediate channels for SPADE block layer. + use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism + (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html). 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + label_nc: int, + num_res_blocks: Sequence[int] | int = (2, 2, 2, 2), + channels: Sequence[int] = (32, 64, 64, 64), + attention_levels: Sequence[bool] = (False, False, True, True), + norm_num_groups: int = 32, + norm_eps: float = 1e-6, + resblock_updown: bool = False, + num_head_channels: int | Sequence[int] = 8, + with_conditioning: bool = False, + transformer_num_layers: int = 1, + cross_attention_dim: int | None = None, + num_class_embeds: int | None = None, + upcast_attention: bool = False, + spade_intermediate_channels: int = 128, + include_fc: bool = True, + use_combined_linear: bool = False, + use_flash_attention: bool = False, + ) -> None: + super().__init__() + if with_conditioning is True and cross_attention_dim is None: + raise ValueError( + "SPADEDiffusionModelUNet expects dimension of the cross-attention conditioning (cross_attention_dim) " + "when using with_conditioning." + ) + if cross_attention_dim is not None and with_conditioning is False: + raise ValueError( + "SPADEDiffusionModelUNet expects with_conditioning=True when specifying the cross_attention_dim." + ) + + # All number of channels should be multiple of num_groups + if any((out_channel % norm_num_groups) != 0 for out_channel in channels): + raise ValueError("SPADEDiffusionModelUNet expects all num_channels being multiple of norm_num_groups") + + if len(channels) != len(attention_levels): + raise ValueError("SPADEDiffusionModelUNet expects num_channels being same size of attention_levels") + + if isinstance(num_head_channels, int): + num_head_channels = ensure_tuple_rep(num_head_channels, len(attention_levels)) + + if len(num_head_channels) != len(attention_levels): + raise ValueError( + "num_head_channels should have the same length as attention_levels. For the i levels without attention," + " i.e. `attention_level[i]=False`, the num_head_channels[i] will be ignored." 
+ ) + + if isinstance(num_res_blocks, int): + num_res_blocks = ensure_tuple_rep(num_res_blocks, len(channels)) + + if len(num_res_blocks) != len(channels): + raise ValueError( + "`num_res_blocks` should be a single integer or a tuple of integers with the same length as " + "`num_channels`." + ) + + self.in_channels = in_channels + self.block_out_channels = channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_levels = attention_levels + self.num_head_channels = num_head_channels + self.with_conditioning = with_conditioning + self.label_nc = label_nc + + # input + self.conv_in = Convolution( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=channels[0], + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + + # time + time_embed_dim = channels[0] * 4 + self.time_embed = nn.Sequential( + nn.Linear(channels[0], time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim) + ) + + # class embedding + self.num_class_embeds = num_class_embeds + if num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + + # down + self.down_blocks = nn.ModuleList([]) + output_channel = channels[0] + for i in range(len(channels)): + input_channel = output_channel + output_channel = channels[i] + is_final_block = i == len(channels) - 1 + + down_block = get_down_block( + spatial_dims=spatial_dims, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + num_res_blocks=num_res_blocks[i], + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_downsample=not is_final_block, + resblock_updown=resblock_updown, + with_attn=(attention_levels[i] and not with_conditioning), + with_cross_attn=(attention_levels[i] and with_conditioning), + num_head_channels=num_head_channels[i], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + 
include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + self.down_blocks.append(down_block) + + # mid + self.middle_block = get_mid_block( + spatial_dims=spatial_dims, + in_channels=channels[-1], + temb_channels=time_embed_dim, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + with_conditioning=with_conditioning, + num_head_channels=num_head_channels[-1], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + include_fc=include_fc, + use_combined_linear=use_combined_linear, + use_flash_attention=use_flash_attention, + ) + + # up + self.up_blocks = nn.ModuleList([]) + reversed_block_out_channels = list(reversed(channels)) + reversed_num_res_blocks = list(reversed(num_res_blocks)) + reversed_attention_levels = list(reversed(attention_levels)) + reversed_num_head_channels = list(reversed(num_head_channels)) + output_channel = reversed_block_out_channels[0] + for i in range(len(reversed_block_out_channels)): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(channels) - 1)] + + is_final_block = i == len(channels) - 1 + + up_block = get_spade_up_block( + spatial_dims=spatial_dims, + in_channels=input_channel, + prev_output_channel=prev_output_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + num_res_blocks=reversed_num_res_blocks[i] + 1, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + add_upsample=not is_final_block, + resblock_updown=resblock_updown, + with_attn=(reversed_attention_levels[i] and not with_conditioning), + with_cross_attn=(reversed_attention_levels[i] and with_conditioning), + num_head_channels=reversed_num_head_channels[i], + transformer_num_layers=transformer_num_layers, + cross_attention_dim=cross_attention_dim, + upcast_attention=upcast_attention, + label_nc=label_nc, + 
spade_intermediate_channels=spade_intermediate_channels, + use_flash_attention=use_flash_attention, + ) + + self.up_blocks.append(up_block) + + # out + self.out = nn.Sequential( + nn.GroupNorm(num_groups=norm_num_groups, num_channels=channels[0], eps=norm_eps, affine=True), + nn.SiLU(), + zero_module( + Convolution( + spatial_dims=spatial_dims, + in_channels=channels[0], + out_channels=out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ), + ) + + def forward( + self, + x: torch.Tensor, + timesteps: torch.Tensor, + seg: torch.Tensor, + context: torch.Tensor | None = None, + class_labels: torch.Tensor | None = None, + down_block_additional_residuals: tuple[torch.Tensor] | None = None, + mid_block_additional_residual: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Args: + x: input tensor (N, C, SpatialDims). + timesteps: timestep tensor (N,). + seg: Bx[LABEL_NC]x[SPATIAL DIMENSIONS] tensor of segmentations for SPADE norm. + context: context tensor (N, 1, ContextDim). + class_labels: context tensor (N, ). + down_block_additional_residuals: additional residual tensors for down blocks (N, C, FeatureMapsDims). + mid_block_additional_residual: additional residual tensor for mid block (N, C, FeatureMapsDims). + """ + # 1. time + t_emb = get_timestep_embedding(timesteps, self.block_out_channels[0]) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=x.dtype) + emb = self.time_embed(t_emb) + + # 2. class + if self.num_class_embeds is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + class_emb = self.class_embedding(class_labels) + class_emb = class_emb.to(dtype=x.dtype) + emb = emb + class_emb + + # 3. initial convolution + h = self.conv_in(x) + + # 4. 
down + if context is not None and self.with_conditioning is False: + raise ValueError("model should have with_conditioning = True if context is provided") + down_block_res_samples: list[torch.Tensor] = [h] + for downsample_block in self.down_blocks: + h, res_samples = downsample_block(hidden_states=h, temb=emb, context=context) + for residual in res_samples: + down_block_res_samples.append(residual) + + # Additional residual conections for Controlnets + if down_block_additional_residuals is not None: + new_down_block_res_samples: list[torch.Tensor] = [h] + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples.append(down_block_res_sample) + + down_block_res_samples = new_down_block_res_samples + + # 5. mid + h = self.middle_block(hidden_states=h, temb=emb, context=context) + + # Additional residual conections for Controlnets + if mid_block_additional_residual is not None: + h = h + mid_block_additional_residual + + # 6. up + for upsample_block in self.up_blocks: + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + h = upsample_block(hidden_states=h, res_hidden_states_list=res_samples, seg=seg, temb=emb, context=context) + + # 7. output block + output: torch.Tensor = self.out(h) + + return output diff --git a/monai/networks/nets/spade_network.py b/monai/networks/nets/spade_network.py new file mode 100644 index 0000000000..9164541f27 --- /dev/null +++ b/monai/networks/nets/spade_network.py @@ -0,0 +1,435 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class UpsamplingModes(StrEnum):
    """Interpolation modes accepted by the decoder's ``torch.nn.Upsample`` layer."""

    bicubic = "bicubic"
    nearest = "nearest"
    bilinear = "bilinear"


def _compute_latent_spatial_shape(input_shape: Sequence[int], spatial_dims: int, depth: int) -> list[int]:
    """
    Validate ``input_shape`` and return the spatial shape after ``depth`` halvings.

    Args:
        input_shape: spatial shape of the full-resolution tensor (HxW[xD]).
        spatial_dims: number of spatial dimensions the network was built for.
        depth: number of stride-2 stages (each one halves every spatial dimension).

    Raises:
        ValueError: if ``input_shape`` has the wrong length or a dimension is not divisible by ``2 ** depth``.
    """
    if len(input_shape) != spatial_dims:
        # f-string instead of "%s" % (input_shape): the latter raises TypeError for tuple shapes.
        raise ValueError(f"Length of parameter input shape must match spatial_dims; got {input_shape}")
    factor = 2**depth
    for s_ind, s_ in enumerate(input_shape):
        if s_ % factor != 0:
            raise ValueError(
                "Each dimension of your input must be divisible by 2 ** (autoencoder depth). "
                f"The shape in position {s_ind}, {s_} is not divisible by {factor}."
            )
    return [s_ // factor for s_ in input_shape]


class SPADENetResBlock(nn.Module):
    """
    Creates a Residual Block with SPADE normalisation.

    Args:
        spatial_dims: number of spatial dimensions
        in_channels: number of input channels
        out_channels: number of output channels
        label_nc: number of semantic channels that will be taken into account in SPADE normalisation blocks
        spade_intermediate_channels: number of intermediate channels in the middle conv. layers in SPADE
            normalisation blocks
        norm: base normalisation type used on top of SPADE
        act: activation applied after every SPADE normalisation of the main branch
        kernel_size: convolutional kernel size
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        out_channels: int,
        label_nc: int,
        spade_intermediate_channels: int = 128,
        norm: str | tuple = "INSTANCE",
        act: str | tuple = (Act.LEAKYRELU, {"negative_slope": 0.2}),
        kernel_size: int = 3,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.int_channels = min(self.in_channels, self.out_channels)
        # a projection on the skip path is only needed when the channel count changes
        self.learned_shortcut = self.in_channels != self.out_channels
        self.conv_0 = Convolution(
            spatial_dims=spatial_dims, in_channels=self.in_channels, out_channels=self.int_channels, act=None, norm=None
        )
        self.conv_1 = Convolution(
            spatial_dims=spatial_dims,
            in_channels=self.int_channels,
            out_channels=self.out_channels,
            act=None,
            norm=None,
        )
        self.activation = get_act_layer(act)
        self.norm_0 = SPADE(
            label_nc=label_nc,
            norm_nc=self.in_channels,
            kernel_size=kernel_size,
            spatial_dims=spatial_dims,
            hidden_channels=spade_intermediate_channels,
            norm=norm,
        )
        self.norm_1 = SPADE(
            label_nc=label_nc,
            norm_nc=self.int_channels,
            kernel_size=kernel_size,
            spatial_dims=spatial_dims,
            hidden_channels=spade_intermediate_channels,
            norm=norm,
        )

        if self.learned_shortcut:
            self.conv_s = Convolution(
                spatial_dims=spatial_dims,
                in_channels=self.in_channels,
                out_channels=self.out_channels,
                act=None,
                norm=None,
                kernel_size=1,
            )
            self.norm_s = SPADE(
                label_nc=label_nc,
                norm_nc=self.in_channels,
                kernel_size=kernel_size,
                spatial_dims=spatial_dims,
                hidden_channels=spade_intermediate_channels,
                norm=norm,
            )

    def forward(self, x, seg):
        """Return ``shortcut(x) + conv_1(act(norm_1(conv_0(act(norm_0(x))))))``, every norm conditioned on ``seg``."""
        x_s = self.shortcut(x, seg)
        dx = self.conv_0(self.activation(self.norm_0(x, seg)))
        dx = self.conv_1(self.activation(self.norm_1(dx, seg)))
        return x_s + dx

    def shortcut(self, x, seg):
        """Skip path: identity, or SPADE norm followed by a 1-kernel conv when the channel count changes."""
        if self.learned_shortcut:
            return self.conv_s(self.norm_s(x, seg))
        return x


class SPADEEncoder(nn.Module):
    """
    Encoding branch of a VAE compatible with a SPADE-like generator

    Args:
        spatial_dims: number of spatial dimensions
        in_channels: number of input channels
        z_dim: latent space dimension of the VAE containing the image style information
        channels: number of output channels after each downsampling block
        input_shape: spatial input shape of the tensor, necessary to do the reshaping after the linear layers
            of the autoencoder (HxWx[D])
        kernel_size: convolutional kernel size
        norm: normalisation layer type
        act: activation type

    Raises:
        ValueError: if ``input_shape`` does not match ``spatial_dims`` or is not divisible by ``2 ** len(channels)``.
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        z_dim: int,
        channels: Sequence[int],
        input_shape: Sequence[int],
        kernel_size: int = 3,
        norm: str | tuple = "INSTANCE",
        act: str | tuple = (Act.LEAKYRELU, {"negative_slope": 0.2}),
    ):
        super().__init__()
        self.in_channels = in_channels
        self.z_dim = z_dim
        self.channels = channels
        self.latent_spatial_shape = _compute_latent_spatial_shape(input_shape, spatial_dims, len(channels))
        self.input_shape = input_shape

        blocks = []
        ch_init = self.in_channels
        for ch_value in channels:
            # every stage halves the spatial resolution (strides=2)
            blocks.append(
                Convolution(
                    spatial_dims=spatial_dims,
                    in_channels=ch_init,
                    out_channels=ch_value,
                    strides=2,
                    kernel_size=kernel_size,
                    norm=norm,
                    act=act,
                )
            )
            ch_init = ch_value

        self.blocks = nn.ModuleList(blocks)
        # plain Python int: np.prod returns a numpy scalar, which nn.Linear is not documented to accept
        flat_features = int(np.prod(self.latent_spatial_shape)) * self.channels[-1]
        self.fc_mu = nn.Linear(in_features=flat_features, out_features=self.z_dim)
        self.fc_var = nn.Linear(in_features=flat_features, out_features=self.z_dim)

    def forward(self, x):
        """Encode ``x`` and return the pair ``(mu, logvar)`` produced by the two linear heads."""
        for block in self.blocks:
            x = block(x)
        x = x.view(x.size(0), -1)
        mu = self.fc_mu(x)
        logvar = self.fc_var(x)
        return mu, logvar

    def encode(self, x):
        """Encode ``x`` and return a latent sample drawn with the reparameterisation trick."""
        mu, logvar = self.forward(x)
        return self.reparameterize(mu, logvar)

    def reparameterize(self, mu, logvar):
        """Return ``mu + eps * exp(0.5 * logvar)`` with ``eps`` drawn by ``torch.randn_like``."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps.mul(std) + mu


class SPADEDecoder(nn.Module):
    """
    Decoder branch of a SPADE-like generator. It can be used independently, without an encoding branch,
    behaving like a GAN, or coupled to a SPADE encoder.

    Args:
        spatial_dims: number of spatial dimensions
        out_channels: number of output channels
        label_nc: number of semantic channels used for the SPADE normalisation blocks
        input_shape: spatial input shape of the tensor, necessary to do the reshaping after the linear layers
        channels: number of output channels after each residual block; the list is not modified
        z_dim: latent space dimension of the VAE containing the image style information (None if encoder is not used)
        is_vae: whether the decoder is going to be coupled to an autoencoder or not (true: yes, false: no)
        spade_intermediate_channels: number of channels in the intermediate layers of the SPADE normalisation blocks
        norm: base normalisation type
        act: activation layer type
        last_act: activation layer type for the last layer of the network (can differ from previous)
        kernel_size: convolutional kernel size
        upsampling_mode: upsampling mode (nearest, bilinear etc.)

    Raises:
        ValueError: if ``input_shape`` is invalid, or if ``is_vae`` is True and ``z_dim`` is None.
    """

    def __init__(
        self,
        spatial_dims: int,
        out_channels: int,
        label_nc: int,
        input_shape: Sequence[int],
        channels: list[int],
        z_dim: int | None = None,
        is_vae: bool = True,
        spade_intermediate_channels: int = 128,
        norm: str | tuple = "INSTANCE",
        act: str | tuple = (Act.LEAKYRELU, {"negative_slope": 0.2}),
        last_act: str | tuple | None = (Act.LEAKYRELU, {"negative_slope": 0.2}),
        kernel_size: int = 3,
        upsampling_mode: str = UpsamplingModes.nearest.value,
    ):
        super().__init__()
        self.is_vae = is_vae
        self.out_channels = out_channels
        self.label_nc = label_nc
        # private copy: the caller's list must not change when the output width is appended below
        self.num_channels = list(channels)
        self.latent_spatial_shape = _compute_latent_spatial_shape(input_shape, spatial_dims, len(self.num_channels))

        if not self.is_vae:
            self.conv_init = Convolution(
                spatial_dims=spatial_dims, in_channels=label_nc, out_channels=channels[0], kernel_size=kernel_size
            )
        elif z_dim is None:
            raise ValueError(
                "If the network is used in VAE-GAN mode, parameter z_dim "
                "(number of latent channels in the VAE) must be populated."
            )
        else:
            self.fc = nn.Linear(z_dim, int(np.prod(self.latent_spatial_shape)) * channels[0])

        self.z_dim = z_dim
        # block i maps stage_channels[i] -> stage_channels[i + 1]; the last block ends at out_channels
        stage_channels = [*self.num_channels, self.out_channels]
        self.upsampling = torch.nn.Upsample(scale_factor=2, mode=upsampling_mode)
        blocks = []
        for ch_ind, ch_value in enumerate(stage_channels[:-1]):
            blocks.append(
                SPADENetResBlock(
                    spatial_dims=spatial_dims,
                    in_channels=ch_value,
                    out_channels=stage_channels[ch_ind + 1],
                    label_nc=label_nc,
                    spade_intermediate_channels=spade_intermediate_channels,
                    norm=norm,
                    kernel_size=kernel_size,
                    act=act,
                )
            )

        self.blocks = torch.nn.ModuleList(blocks)
        self.last_conv = Convolution(
            spatial_dims=spatial_dims,
            in_channels=stage_channels[-1],
            out_channels=out_channels,
            padding=(kernel_size - 1) // 2,
            kernel_size=kernel_size,
            norm=None,
            act=last_act,
        )

    def forward(self, seg, z: torch.Tensor | None = None):
        """
        Args:
            seg: input BxCxHxW[xD] semantic map on which the output is conditioned
            z: latent vector output by the encoder if self.is_vae is True; when omitted in VAE mode a
                standard-normal vector is sampled instead. Ignored when is_vae is False, where the
                starting feature map is derived from a resized ``seg``.

        Returns:
            the output of the final convolution applied after all residual/upsampling stages.
        """
        if not self.is_vae:
            x = F.interpolate(seg, size=tuple(self.latent_spatial_shape))
            x = self.conv_init(x)
        else:
            if z is None and self.z_dim is not None:
                # Even though this network is a VAE, it should be possible to sample from noise as well.
                # seg.device (not seg.get_device()): get_device() yields -1 for CPU tensors.
                z = torch.randn(seg.size(0), self.z_dim, dtype=torch.float32, device=seg.device)
            x = self.fc(z)
            x = x.view(*[-1, self.num_channels[0]] + self.latent_spatial_shape)

        for res_block in self.blocks:
            x = res_block(x, seg)
            x = self.upsampling(x)

        x = self.last_conv(x)
        return x


class SPADENet(nn.Module):
    """
    SPADE Network, implemented based on the code by Park, T et al. in
    "Semantic Image Synthesis with Spatially-Adaptive Normalization"
    (https://github.com/NVlabs/SPADE)

    Args:
        spatial_dims: number of spatial dimensions
        in_channels: number of input channels
        out_channels: number of output channels
        label_nc: number of semantic channels used for the SPADE normalisation blocks
        input_shape: spatial input shape of the tensor, necessary to do the reshaping after the linear layers
        channels: number of output channels after each downsampling block (encoder order); the decoder uses
            the reversed sequence. The list passed in is not modified.
        z_dim: latent space dimension of the VAE containing the image style information (None if encoder is not used)
        is_vae: whether the decoder is going to be coupled to an autoencoder (true) or not (false)
        spade_intermediate_channels: number of channels in the intermediate layers of the SPADE normalisation blocks
        norm: base normalisation type
        act: activation layer type
        last_act: activation layer type for the last layer of the network (can differ from previous)
        kernel_size: convolutional kernel size
        upsampling_mode: upsampling mode (nearest, bilinear etc.)

    Raises:
        ValueError: if ``is_vae`` is True and ``z_dim`` is None.
    """

    def __init__(
        self,
        spatial_dims: int,
        in_channels: int,
        out_channels: int,
        label_nc: int,
        input_shape: Sequence[int],
        channels: list[int],
        z_dim: int | None = None,
        is_vae: bool = True,
        spade_intermediate_channels: int = 128,
        norm: str | tuple = "INSTANCE",
        act: str | tuple = (Act.LEAKYRELU, {"negative_slope": 0.2}),
        last_act: str | tuple | None = (Act.LEAKYRELU, {"negative_slope": 0.2}),
        kernel_size: int = 3,
        upsampling_mode: str = UpsamplingModes.nearest.value,
    ):
        super().__init__()
        self.is_vae = is_vae
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.channels = channels
        self.label_nc = label_nc
        self.input_shape = input_shape

        if self.is_vae:
            if z_dim is None:
                raise ValueError(
                    "The latent space dimension mapped by parameter z_dim cannot be None if is_vae is True."
                )
            self.encoder = SPADEEncoder(
                spatial_dims=spatial_dims,
                in_channels=in_channels,
                z_dim=z_dim,
                channels=channels,
                input_shape=input_shape,
                kernel_size=kernel_size,
                norm=norm,
                act=act,
            )

        # reversed copy: list.reverse() would also flip the caller's list and the encoder's reference to it
        decoder_channels = list(reversed(channels))

        self.decoder = SPADEDecoder(
            spatial_dims=spatial_dims,
            out_channels=out_channels,
            label_nc=label_nc,
            input_shape=input_shape,
            channels=decoder_channels,
            z_dim=z_dim,
            is_vae=is_vae,
            spade_intermediate_channels=spade_intermediate_channels,
            norm=norm,
            act=act,
            last_act=last_act,
            kernel_size=kernel_size,
            upsampling_mode=upsampling_mode,
        )

    def forward(self, seg: torch.Tensor, x: torch.Tensor | None = None):
        """
        Args:
            seg: semantic map the output is conditioned on.
            x: image passed to the encoder; required when ``is_vae`` is True, unused otherwise.

        Returns:
            ``(output, z_mu, z_logvar)`` in VAE mode, otherwise the 1-tuple ``(output,)``.

        Raises:
            ValueError: if ``is_vae`` is True and ``x`` is None.
        """
        if not self.is_vae:
            return (self.decoder(seg, None),)
        if x is None:
            raise ValueError("Input x must be provided to forward when is_vae is True.")
        z_mu, z_logvar = self.encoder(x)
        z = self.encoder.reparameterize(z_mu, z_logvar)
        return self.decoder(seg, z), z_mu, z_logvar

    def encode(self, x: torch.Tensor):
        """Return a latent sample for ``x`` in VAE mode, or None when there is no encoder."""
        if self.is_vae:
            return self.encoder.encode(x)
        return None

    def decode(self, seg: torch.Tensor, z: torch.Tensor | None = None):
        """Run only the decoder on ``seg`` with an optional latent ``z``."""
        return self.decoder(seg, z)
a/monai/networks/nets/swin_unetr.py b/monai/networks/nets/swin_unetr.py index 6f96dfd291..3900c866b3 100644 --- a/monai/networks/nets/swin_unetr.py +++ b/monai/networks/nets/swin_unetr.py @@ -347,7 +347,7 @@ def window_partition(x, window_size): x: input tensor. window_size: local window size. """ - x_shape = x.size() + x_shape = x.size() # length 4 or 5 only if len(x_shape) == 5: b, d, h, w, c = x_shape x = x.view( @@ -363,10 +363,11 @@ def window_partition(x, window_size): windows = ( x.permute(0, 1, 3, 5, 2, 4, 6, 7).contiguous().view(-1, window_size[0] * window_size[1] * window_size[2], c) ) - elif len(x_shape) == 4: + else: # if len(x_shape) == 4: b, h, w, c = x.shape x = x.view(b, h // window_size[0], window_size[0], w // window_size[1], window_size[1], c) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0] * window_size[1], c) + return windows @@ -613,7 +614,7 @@ def forward_part1(self, x, mask_matrix): _, dp, hp, wp, _ = x.shape dims = [b, dp, hp, wp] - elif len(x_shape) == 4: + else: # elif len(x_shape) == 4 b, h, w, c = x.shape window_size, shift_size = get_window_size((h, w), self.window_size, self.shift_size) pad_l = pad_t = 0 diff --git a/monai/networks/nets/transformer.py b/monai/networks/nets/transformer.py new file mode 100644 index 0000000000..3a278c112a --- /dev/null +++ b/monai/networks/nets/transformer.py @@ -0,0 +1,157 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class AbsolutePositionalEmbedding(nn.Module):
    """Absolute positional embedding.

    Args:
        max_seq_len: Maximum sequence length.
        embedding_dim: Dimensionality of the embedding.
    """

    def __init__(self, max_seq_len: int, embedding_dim: int) -> None:
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(max_seq_len, embedding_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the embeddings of positions ``0 .. seq_len - 1`` for every row of the 2D input ``x``."""
        batch_size, seq_len = x.size()
        positions = torch.arange(seq_len, device=x.device).repeat(batch_size, 1)
        embedding: torch.Tensor = self.embedding(positions)
        return embedding


class DecoderOnlyTransformer(nn.Module):
    """Decoder-only (Autoregressive) Transformer model.

    Args:
        num_tokens: Number of tokens in the vocabulary.
        max_seq_len: Maximum sequence length.
        attn_layers_dim: Dimensionality of the attention layers.
        attn_layers_depth: Number of attention layers.
        attn_layers_heads: Number of attention heads.
        with_cross_attention: Whether to use cross attention for conditioning.
        embedding_dropout_rate: Dropout rate for the embedding.
        include_fc: whether to include the final linear layer. Default to True.
        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
    """

    def __init__(
        self,
        num_tokens: int,
        max_seq_len: int,
        attn_layers_dim: int,
        attn_layers_depth: int,
        attn_layers_heads: int,
        with_cross_attention: bool = False,
        embedding_dropout_rate: float = 0.0,
        include_fc: bool = True,
        use_combined_linear: bool = False,
        use_flash_attention: bool = False,
    ) -> None:
        super().__init__()
        self.num_tokens = num_tokens
        self.max_seq_len = max_seq_len
        self.attn_layers_dim = attn_layers_dim
        self.attn_layers_depth = attn_layers_depth
        self.attn_layers_heads = attn_layers_heads
        self.with_cross_attention = with_cross_attention

        self.token_embeddings = nn.Embedding(num_tokens, attn_layers_dim)
        self.position_embeddings = AbsolutePositionalEmbedding(max_seq_len=max_seq_len, embedding_dim=attn_layers_dim)
        self.embedding_dropout = nn.Dropout(embedding_dropout_rate)

        self.blocks = nn.ModuleList(
            [
                TransformerBlock(
                    hidden_size=attn_layers_dim,
                    mlp_dim=attn_layers_dim * 4,
                    num_heads=attn_layers_heads,
                    dropout_rate=0.0,
                    qkv_bias=False,
                    causal=True,
                    sequence_length=max_seq_len,
                    with_cross_attention=with_cross_attention,
                    include_fc=include_fc,
                    use_combined_linear=use_combined_linear,
                    use_flash_attention=use_flash_attention,
                )
                for _ in range(attn_layers_depth)
            ]
        )

        self.to_logits = nn.Linear(attn_layers_dim, num_tokens)

    def forward(self, x: torch.Tensor, context: torch.Tensor | None = None) -> torch.Tensor:
        """
        Args:
            x: 2D tensor of token indices (batch, sequence).
            context: optional conditioning tensor forwarded to every transformer block.

        Returns:
            the output of the final linear layer applied to the last block's hidden states.
        """
        tok_emb = self.token_embeddings(x)
        pos_emb = self.position_embeddings(x)
        x = self.embedding_dropout(tok_emb + pos_emb)

        for block in self.blocks:
            x = block(x, context=context)
        logits: torch.Tensor = self.to_logits(x)
        return logits

    def load_old_state_dict(self, old_state_dict: dict, verbose=False) -> None:
        """
        Load a state dict from a DecoderOnlyTransformer trained with
        [MONAI Generative](https://github.com/Project-MONAI/GenerativeModels).

        The old layout names the normalisation layers ``norm2`` / ``norm3`` where this model uses
        ``norm_cross_attn`` / ``norm2``. The renames are applied before same-name keys are copied,
        because an old ``norm2`` key collides by name with this model's own ``norm2`` key.

        Args:
            old_state_dict: state dict from the old DecoderOnlyTransformer model. It is not modified.
            verbose: if True, print the keys that are missing on either side and the keys left unused.
        """

        new_state_dict = self.state_dict()
        # if all keys match, just load the state dict
        if all(k in new_state_dict for k in old_state_dict):
            print("All keys match, loading state dict.")
            self.load_state_dict(old_state_dict)
            return

        if verbose:
            # print all new_state_dict keys that are not in old_state_dict
            for k in new_state_dict:
                if k not in old_state_dict:
                    print(f"key {k} not found in old state dict")
            # and vice versa
            print("----------------------------------------------")
            for k in old_state_dict:
                if k not in new_state_dict:
                    print(f"key {k} not found in new state dict")

        # shallow copy so that consuming keys below does not empty the caller's dict
        remaining = dict(old_state_dict)
        old_keys = set(remaining)

        # fix the renamed norm blocks first: norm2 -> norm_cross_attn, norm3 -> norm2
        for k in list(remaining):
            if "norm3" in k:
                new_state_dict[k.replace("norm3", "norm2")] = remaining.pop(k)
            elif "norm2" in k and k.replace("norm2", "norm3") in old_keys:
                # a sibling norm3 key marks the old layout, where norm2 is the cross-attention norm
                new_state_dict[k.replace("norm2", "norm_cross_attn")] = remaining.pop(k)

        # copy over all remaining keys whose names are unchanged
        for k in new_state_dict:
            if k in remaining:
                new_state_dict[k] = remaining.pop(k)

        if verbose:
            # print all remaining keys in old_state_dict
            print("remaining keys in old_state_dict:", remaining.keys())
        self.load_state_dict(new_state_dict)
def vista3d132(encoder_embed_dim: int = 48, in_channels: int = 1):
    """
    Build the VISTA3D network configuration described in https://arxiv.org/abs/2406.05285.
    The model treats class index larger than 132 as zero-shot.

    Args:
        encoder_embed_dim: hidden dimension for encoder.
        in_channels: input channel number.

    Returns:
        a ``VISTA3D`` instance combining a ``SegResNetDS2`` encoder, a ``ClassMappingClassify``
        class head and a ``PointMappingSAM`` point head.
    """
    # both heads are built for 512 class slots
    n_classes = 512

    # modules are created in the order encoder -> point head -> class head
    encoder = SegResNetDS2(
        in_channels=in_channels,
        blocks_down=(1, 2, 2, 4, 4),
        norm="instance",
        out_channels=encoder_embed_dim,
        init_filters=encoder_embed_dim,
        dsdepth=1,
    )
    point_branch = PointMappingSAM(feature_size=encoder_embed_dim, n_classes=n_classes, last_supported=132)
    class_branch = ClassMappingClassify(n_classes=n_classes, feature_size=encoder_embed_dim, use_mlp=True)
    return VISTA3D(image_encoder=encoder, class_head=class_branch, point_head=point_branch)
+ class_head: class head used for class index based segmentation + point_head: point head used for interactive segmetnation + """ + + def __init__(self, image_encoder: nn.Module, class_head: nn.Module, point_head: nn.Module): + super().__init__() + self.image_encoder = image_encoder + self.class_head = class_head + self.point_head = point_head + self.image_embeddings = None + self.auto_freeze = False + self.point_freeze = False + self.NINF_VALUE = -9999 + self.PINF_VALUE = 9999 + + def get_foreground_class_count(self, class_vector: torch.Tensor | None, point_coords: torch.Tensor | None) -> int: + """Get number of foreground classes based on class and point prompt.""" + if class_vector is None: + if point_coords is None: + raise ValueError("class_vector and point_coords cannot be both None.") + return point_coords.shape[0] + else: + return class_vector.shape[0] + + def convert_point_label( + self, + point_label: torch.Tensor, + label_set: Sequence[int] | None = None, + special_index: Sequence[int] = (23, 24, 25, 26, 27, 57, 128), + ): + """ + Convert point label based on its class prompt. For special classes defined in special index, + the positive/negative point label will be converted from 1/0 to 3/2. The purpose is to separate those + classes with ambiguous classes. + + Args: + point_label: the point label tensor, [B, N]. + label_set: the label index matching the indexes in labels. If labels are mapped to global index using RelabelID, + this label_set should be global mapped index. If labels are not mapped to global index, e.g. in zero-shot + evaluation, this label_set should be the original index. + special_index: the special class index that needs to be converted. 
+ """ + if label_set is None: + return point_label + if not point_label.shape[0] == len(label_set): + raise ValueError("point_label and label_set must have the same length.") + + for i in range(len(label_set)): + if label_set[i] in special_index: + for j in range(len(point_label[i])): + point_label[i, j] = point_label[i, j] + 2 if point_label[i, j] > -1 else point_label[i, j] + return point_label + + def sample_points_patch_val( + self, + labels: torch.Tensor, + patch_coords: Sequence[slice], + label_set: Sequence[int], + use_center: bool = True, + mapped_label_set: Sequence[int] | None = None, + max_ppoint: int = 1, + max_npoint: int = 0, + ): + """ + Sample points for patch during sliding window validation. Only used for point only validation. + + Args: + labels: shape [1, 1, H, W, D]. + patch_coords: a sequence of sliding window slice objects. + label_set: local index, must match values in labels. + use_center: sample points from the center. + mapped_label_set: global index, it is used to identify special classes and is the global index + for the sampled points. + max_ppoint/max_npoint: positive points and negative points to sample. + """ + point_coords, point_labels = sample_points_from_label( + labels[patch_coords], + label_set, + max_ppoint=max_ppoint, + max_npoint=max_npoint, + device=labels.device, + use_center=use_center, + ) + point_labels = self.convert_point_label(point_labels, mapped_label_set) + return (point_coords, point_labels, torch.tensor(label_set).to(point_coords.device).unsqueeze(-1)) + + def update_point_to_patch( + self, patch_coords: Sequence[slice], point_coords: torch.Tensor, point_labels: torch.Tensor + ): + """ + Update point_coords with respect to patch coords. + If point is outside of the patch, remove the coordinates and set label to -1. + + Args: + patch_coords: a sequence of the python slice objects representing the patch coordinates during sliding window inference. + This value is passed from sliding_window_inferer. 
+ point_coords: point coordinates, [B, N, 3]. + point_labels: point labels, [B, N]. + """ + patch_ends = [patch_coords[-3].stop, patch_coords[-2].stop, patch_coords[-1].stop] + patch_starts = [patch_coords[-3].start, patch_coords[-2].start, patch_coords[-1].start] + # update point coords + patch_starts_tensor = unsqueeze_left(torch.tensor(patch_starts, device=point_coords.device), 2) + patch_ends_tensor = unsqueeze_left(torch.tensor(patch_ends, device=point_coords.device), 2) + # [1 N 1] + indices = torch.logical_and( + ((point_coords - patch_starts_tensor) > 0).all(2), ((patch_ends_tensor - point_coords) > 0).all(2) + ) + # check if it's within patch coords + point_coords = point_coords.clone() - patch_starts_tensor + point_labels = point_labels.clone() + if indices.any(): + point_labels[~indices] = -1 + point_coords[~indices] = 0 + # also remove padded points, mainly used for inference. + not_pad_indices = (point_labels != -1).any(0) + point_coords = point_coords[:, not_pad_indices] + point_labels = point_labels[:, not_pad_indices] + return point_coords, point_labels + return None, None + + def connected_components_combine( + self, + logits: torch.Tensor, + point_logits: torch.Tensor, + point_coords: torch.Tensor, + point_labels: torch.Tensor, + mapping_index: torch.Tensor, + thred: float = 0.5, + ): + """ + Combine auto results with point click response. The auto results have shape [B, 1, H, W, D] which means B foreground masks + from a single image patch. + Out of those B foreground masks, user may add points to a subset of B1 foreground masks for editing. + mapping_index represents the correspondence between B and B1. + For mapping_index with point clicks, NaN values in logits will be replaced with point_logits. Meanwhile, the added/removed + region in point clicks must be updated by the lcc function. + Notice, if a positive point is within logits/prev_mask, the components containing the positive point will be added. 
+ + Args: + logits: automatic branch results, [B, 1, H, W, D]. + point_logits: point branch results, [B1, 1, H, W, D]. + point_coords: point coordinates, [B1, N, 3]. + point_labels: point labels, [B1, N]. + mapping_index: [B]. + thred: the threshold to convert logits to binary. + """ + logits = logits.as_tensor() if isinstance(logits, monai.data.MetaTensor) else logits + _logits = logits[mapping_index] + inside = [] + for i in range(_logits.shape[0]): + inside.append( + np.any( + [ + _logits[i, 0, p[0], p[1], p[2]].item() > 0 + for p in point_coords[i].cpu().numpy().round().astype(int) + ] + ) + ) + inside_tensor = torch.tensor(inside).to(logits.device) + nan_mask = torch.isnan(_logits) + # _logits are converted to binary [B1, 1, H, W, D] + _logits = torch.nan_to_num(_logits, nan=self.NINF_VALUE).sigmoid() + pos_region = point_logits.sigmoid() > thred + diff_pos = torch.logical_and(torch.logical_or(_logits <= thred, unsqueeze_right(inside_tensor, 5)), pos_region) + diff_neg = torch.logical_and((_logits > thred), ~pos_region) + cc = lcc(diff_pos, diff_neg, point_coords=point_coords, point_labels=point_labels) + # cc is the region that can be updated by point_logits. + cc = cc.to(logits.device) + # Need to replace NaN with point_logits. diff_neg will never lie in nan_mask, + # only remove unconnected positive region. 
+ uc_pos_region = torch.logical_and(pos_region, ~cc) + fill_mask = torch.logical_and(nan_mask, uc_pos_region) + if fill_mask.any(): + # fill in the mean negative value + point_logits[fill_mask] = -1 + # replace logits nan value and cc with point_logits + cc = torch.logical_or(nan_mask, cc).to(logits.dtype) + logits[mapping_index] *= 1 - cc + logits[mapping_index] += cc * point_logits + return logits + + def gaussian_combine( + self, + logits: torch.Tensor, + point_logits: torch.Tensor, + point_coords: torch.Tensor, + point_labels: torch.Tensor, + mapping_index: torch.Tensor, + radius: int | None = None, + ): + """ + Combine point results with auto results using gaussian. + + Args: + logits: automatic branch results, [B, 1, H, W, D]. + point_logits: point branch results, [B1, 1, H, W, D]. + point_coords: point coordinates, [B1, N, 3]. + point_labels: point labels, [B1, N]. + mapping_index: [B]. + radius: gaussian ball radius. + """ + if radius is None: + radius = min(point_logits.shape[-3:]) // 5 # empirical value 5 + weight = 1 - convert_points_to_disc(point_logits.shape[-3:], point_coords, point_labels, radius=radius).sum( + 1, keepdims=True + ) + weight[weight < 0] = 0 + logits = logits.as_tensor() if isinstance(logits, monai.data.MetaTensor) else logits + logits[mapping_index] *= weight + logits[mapping_index] += (1 - weight) * point_logits + return logits + + def set_auto_grad(self, auto_freeze: bool = False, point_freeze: bool = False): + """ + Freeze auto-branch or point-branch. + + Args: + auto_freeze: whether to freeze the auto branch. + point_freeze: whether to freeze the point branch. 
+ """ + if auto_freeze != self.auto_freeze: + if hasattr(self.image_encoder, "set_auto_grad"): + self.image_encoder.set_auto_grad(auto_freeze=auto_freeze, point_freeze=point_freeze) + else: + for param in self.image_encoder.parameters(): + param.requires_grad = (not auto_freeze) and (not point_freeze) + for param in self.class_head.parameters(): + param.requires_grad = not auto_freeze + self.auto_freeze = auto_freeze + + if point_freeze != self.point_freeze: + if hasattr(self.image_encoder, "set_auto_grad"): + self.image_encoder.set_auto_grad(auto_freeze=auto_freeze, point_freeze=point_freeze) + else: + for param in self.image_encoder.parameters(): + param.requires_grad = (not auto_freeze) and (not point_freeze) + for param in self.point_head.parameters(): + param.requires_grad = not point_freeze + self.point_freeze = point_freeze + + def forward( + self, + input_images: torch.Tensor, + point_coords: torch.Tensor | None = None, + point_labels: torch.Tensor | None = None, + class_vector: torch.Tensor | None = None, + prompt_class: torch.Tensor | None = None, + patch_coords: Sequence[slice] | None = None, + labels: torch.Tensor | None = None, + label_set: Sequence[int] | None = None, + prev_mask: torch.Tensor | None = None, + radius: int | None = None, + val_point_sampler: Callable | None = None, + **kwargs, + ): + """ + The forward function for VISTA3D. We only support single patch in training and inference. + One exception is allowing sliding window batch size > 1 for automatic segmentation only case. + B represents number of objects, N represents number of points for each objects. + + Args: + input_images: [1, 1, H, W, D] + point_coords: [B, N, 3] + point_labels: [B, N], -1 represents padding. 0/1 means negative/positive points for regular class. + 2/3 means negative/postive ponits for special supported class like tumor. + class_vector: [B, 1], the global class index + prompt_class: [B, 1], the global class index. 
This value is associated with point_coords to identify if + the points are for zero-shot or supported class. When class_vector and point_coords are both + provided, prompt_class is the same as class_vector. For prompt_class[b] > 512, point_coords[b] + will be considered novel class. + patch_coords: a sequence of the python slice objects representing the patch coordinates during sliding window inference. + This value is passed from sliding_window_inferer. This is an indicator for training phase or validation phase. + labels: [1, 1, H, W, D], the groundtruth label tensor, only used for point-only evaluation + label_set: the label index matching the indexes in labels. If labels are mapped to global index using RelabelID, + this label_set should be global mapped index. If labels are not mapped to global index, e.g. in zero-shot + evaluation, this label_set should be the original index. + prev_mask: [B, N, H_fullsize, W_fullsize, D_fullsize]. + This is the transposed raw output from sliding_window_inferer before any postprocessing. + When user click points to perform auto-results correction, this can be the auto-results. + radius: single float value controling the gaussian blur when combining point and auto results. + The gaussian combine is not used in VISTA3D training but might be useful for finetuning purposes. + val_point_sampler: function used to sample points from labels. This is only used for point-only evaluation. + + """ + image_size = input_images.shape[-3:] + device = input_images.device + if point_coords is None and class_vector is None: + return self.NINF_VALUE + torch.zeros([1, 1, *image_size], device=device) + + bs = self.get_foreground_class_count(class_vector, point_coords) + if patch_coords is not None: + # if during validation and perform enable based point-validation. + if labels is not None and label_set is not None: + # if labels is not None, sample from labels for each patch. 
+ if val_point_sampler is None: + # TODO: think about how to refactor this part. + val_point_sampler = self.sample_points_patch_val + point_coords, point_labels, prompt_class = val_point_sampler(labels, patch_coords, label_set) + if prompt_class[0].item() == 0: # type: ignore + point_labels[0] = -1 # type: ignore + labels, prev_mask = None, None + elif point_coords is not None: + # If not performing patch-based point only validation, use user provided click points for inference. + # the point clicks is in original image space, convert it to current patch-coordinate space. + point_coords, point_labels = self.update_point_to_patch(patch_coords, point_coords, point_labels) # type: ignore + + if point_coords is not None and point_labels is not None: + # remove points that used for padding purposes (point_label = -1) + mapping_index = ((point_labels != -1).sum(1) > 0).to(torch.bool) + if mapping_index.any(): + point_coords = point_coords[mapping_index] + point_labels = point_labels[mapping_index] + if prompt_class is not None: + prompt_class = prompt_class[mapping_index] + else: + if self.auto_freeze or (class_vector is None and patch_coords is None): + # if auto_freeze, point prompt must exist to allow loss backward + # in training, class_vector and point cannot both be None due to loss.backward() + mapping_index.fill_(True) + else: + point_coords, point_labels = None, None + + if point_coords is None and class_vector is None: + return self.NINF_VALUE + torch.zeros([bs, 1, *image_size], device=device) + + if self.image_embeddings is not None and kwargs.get("keep_cache", False) and class_vector is None: + out, out_auto = self.image_embeddings, None + else: + out, out_auto = self.image_encoder( + input_images, with_point=point_coords is not None, with_label=class_vector is not None + ) + # release memory + input_images = None # type: ignore + + # force releasing memories that set to None + torch.cuda.empty_cache() + if class_vector is not None: + logits, _ = 
self.class_head(out_auto, class_vector) + if point_coords is not None: + point_logits = self.point_head(out, point_coords, point_labels, class_vector=prompt_class) + if patch_coords is None: + logits = self.gaussian_combine( + logits, point_logits, point_coords, point_labels, mapping_index, radius # type: ignore + ) + else: + # during validation use largest component + logits = self.connected_components_combine( + logits, point_logits, point_coords, point_labels, mapping_index # type: ignore + ) + else: + logits = self.NINF_VALUE + torch.zeros([bs, 1, *image_size], device=device, dtype=out.dtype) + logits[mapping_index] = self.point_head(out, point_coords, point_labels, class_vector=prompt_class) + if prev_mask is not None and patch_coords is not None: + logits = self.connected_components_combine( + prev_mask[patch_coords].transpose(1, 0).to(logits.device), + logits[mapping_index], + point_coords, # type: ignore + point_labels, # type: ignore + mapping_index, + ) + + if kwargs.get("keep_cache", False) and class_vector is None: + self.image_embeddings = out.detach() + return logits + + +class PointMappingSAM(nn.Module): + def __init__(self, feature_size: int, max_prompt: int = 32, n_classes: int = 512, last_supported: int = 132): + """Interactive point head used for VISTA3D. + Adapted from segment anything: + `https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/mask_decoder.py`. + + Args: + feature_size: feature channel from encoder. + max_prompt: max prompt number in each forward iteration. + n_classes: number of classes the model can potentially support. This is the maximum number of class embeddings. + last_supported: number of classes the model support, this value should match the trained model weights. 
+ """ + super().__init__() + transformer_dim = feature_size + self.max_prompt = max_prompt + self.feat_downsample = nn.Sequential( + nn.Conv3d(in_channels=feature_size, out_channels=feature_size, kernel_size=3, stride=2, padding=1), + nn.InstanceNorm3d(feature_size), + nn.GELU(), + nn.Conv3d(in_channels=feature_size, out_channels=transformer_dim, kernel_size=3, stride=1, padding=1), + nn.InstanceNorm3d(feature_size), + ) + + self.mask_downsample = nn.Conv3d(in_channels=2, out_channels=2, kernel_size=3, stride=2, padding=1) + + self.transformer = TwoWayTransformer(depth=2, embedding_dim=transformer_dim, mlp_dim=512, num_heads=4) + self.pe_layer = PositionEmbeddingRandom(transformer_dim // 2) + self.point_embeddings = nn.ModuleList([nn.Embedding(1, transformer_dim), nn.Embedding(1, transformer_dim)]) + self.not_a_point_embed = nn.Embedding(1, transformer_dim) + self.special_class_embed = nn.Embedding(1, transformer_dim) + self.mask_tokens = nn.Embedding(1, transformer_dim) + + self.output_upscaling = nn.Sequential( + nn.ConvTranspose3d(transformer_dim, transformer_dim, kernel_size=3, stride=2, padding=1, output_padding=1), + nn.InstanceNorm3d(transformer_dim), + nn.GELU(), + nn.Conv3d(transformer_dim, transformer_dim, kernel_size=3, stride=1, padding=1), + ) + + self.output_hypernetworks_mlps = MLP(transformer_dim, transformer_dim, transformer_dim, 3) + # class embedding + self.n_classes = n_classes + self.last_supported = last_supported + self.class_embeddings = nn.Embedding(n_classes, feature_size) + self.zeroshot_embed = nn.Embedding(1, transformer_dim) + self.supported_embed = nn.Embedding(1, transformer_dim) + + def forward( + self, + out: torch.Tensor, + point_coords: torch.Tensor, + point_labels: torch.Tensor, + class_vector: torch.Tensor | None = None, + ): + """Args: + out: feature from encoder, [1, C, H, W, C] + point_coords: point coordinates, [B, N, 3] + point_labels: point labels, [B, N] + class_vector: class prompts, [B] + """ + # downsample out + 
out_low = self.feat_downsample(out) + out_shape = tuple(out.shape[-3:]) + # release memory + out = None # type: ignore + torch.cuda.empty_cache() + # embed points + points = point_coords + 0.5 # Shift to center of pixel + point_embedding = self.pe_layer.forward_with_coords(points, out_shape) # type: ignore + point_embedding[point_labels == -1] = 0.0 + point_embedding[point_labels == -1] += self.not_a_point_embed.weight + point_embedding[point_labels == 0] += self.point_embeddings[0].weight + point_embedding[point_labels == 1] += self.point_embeddings[1].weight + point_embedding[point_labels == 2] += self.point_embeddings[0].weight + self.special_class_embed.weight + point_embedding[point_labels == 3] += self.point_embeddings[1].weight + self.special_class_embed.weight + output_tokens = self.mask_tokens.weight + + output_tokens = output_tokens.unsqueeze(0).expand(point_embedding.size(0), -1, -1) + if class_vector is None: + tokens_all = torch.cat( + ( + output_tokens, + point_embedding, + self.supported_embed.weight.unsqueeze(0).expand(point_embedding.size(0), -1, -1), + ), + dim=1, + ) + # tokens_all = torch.cat((output_tokens, point_embedding), dim=1) + else: + class_embeddings = [] + for i in class_vector: + if i > self.last_supported: + class_embeddings.append(self.zeroshot_embed.weight) + else: + class_embeddings.append(self.supported_embed.weight) + tokens_all = torch.cat((output_tokens, point_embedding, torch.stack(class_embeddings)), dim=1) + # cross attention + masks = [] + max_prompt = self.max_prompt + for i in range(int(np.ceil(tokens_all.shape[0] / max_prompt))): + # remove variables in previous for loops to save peak memory for self.transformer + src, upscaled_embedding, hyper_in = None, None, None + torch.cuda.empty_cache() + idx = (i * max_prompt, min((i + 1) * max_prompt, tokens_all.shape[0])) + tokens = tokens_all[idx[0] : idx[1]] + src = torch.repeat_interleave(out_low, tokens.shape[0], dim=0) + pos_src = 
torch.repeat_interleave(self.pe_layer(out_low.shape[-3:]).unsqueeze(0), tokens.shape[0], dim=0) + b, c, h, w, d = src.shape + hs, src = self.transformer(src, pos_src, tokens) + mask_tokens_out = hs[:, :1, :] + hyper_in = self.output_hypernetworks_mlps(mask_tokens_out) + src = src.transpose(1, 2).view(b, c, h, w, d) # type: ignore + upscaled_embedding = self.output_upscaling(src) + b, c, h, w, d = upscaled_embedding.shape + mask = hyper_in @ upscaled_embedding.view(b, c, h * w * d) + masks.append(mask.view(-1, 1, h, w, d)) + + return torch.vstack(masks) + + +class ClassMappingClassify(nn.Module): + """Class head that performs automatic segmentation based on class vector.""" + + def __init__(self, n_classes: int, feature_size: int, use_mlp: bool = True): + """Args: + n_classes: maximum number of class embedding. + feature_size: class embedding size. + use_mlp: use mlp to further map class embedding. + """ + super().__init__() + self.use_mlp = use_mlp + if use_mlp: + self.mlp = nn.Sequential( + nn.Linear(feature_size, feature_size), + nn.InstanceNorm1d(1), + nn.GELU(), + nn.Linear(feature_size, feature_size), + ) + self.class_embeddings = nn.Embedding(n_classes, feature_size) + self.image_post_mapping = nn.Sequential( + UnetrBasicBlock( + spatial_dims=3, + in_channels=feature_size, + out_channels=feature_size, + kernel_size=3, + stride=1, + norm_name="instance", + res_block=True, + ), + UnetrBasicBlock( + spatial_dims=3, + in_channels=feature_size, + out_channels=feature_size, + kernel_size=3, + stride=1, + norm_name="instance", + res_block=True, + ), + ) + + def forward(self, src: torch.Tensor, class_vector: torch.Tensor): + b, c, h, w, d = src.shape + src = self.image_post_mapping(src) + class_embedding = self.class_embeddings(class_vector) + if self.use_mlp: + class_embedding = self.mlp(class_embedding) + # [b,1,feat] @ [1,feat,dim], batch dimension become class_embedding batch dimension. 
+ masks = [] + for i in range(b): + mask = class_embedding @ src[[i]].view(1, c, h * w * d) + masks.append(mask.view(-1, 1, h, w, d)) + + return torch.cat(masks, 1), class_embedding + + +class TwoWayTransformer(nn.Module): + def __init__( + self, + depth: int, + embedding_dim: int, + num_heads: int, + mlp_dim: int, + activation: tuple | str = "relu", + attention_downsample_rate: int = 2, + ) -> None: + """ + A transformer decoder that attends to an input image using + queries whose positional embedding is supplied. + Adapted from `https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/transformer.py`. + + Args: + depth: number of layers in the transformer. + embedding_dim: the channel dimension for the input embeddings. + num_heads: the number of heads for multihead attention. Must divide embedding_dim. + mlp_dim: the channel dimension internal to the MLP block. + activation: the activation to use in the MLP block. + attention_downsample_rate: the rate at which to downsample the image before projecting. + """ + super().__init__() + self.depth = depth + self.embedding_dim = embedding_dim + self.num_heads = num_heads + self.mlp_dim = mlp_dim + self.layers = nn.ModuleList() + + for i in range(depth): + self.layers.append( + TwoWayAttentionBlock( + embedding_dim=embedding_dim, + num_heads=num_heads, + mlp_dim=mlp_dim, + activation=activation, + attention_downsample_rate=attention_downsample_rate, + skip_first_layer_pe=(i == 0), + ) + ) + + self.final_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + self.norm_final_attn = nn.LayerNorm(embedding_dim) + + def forward( + self, image_embedding: torch.Tensor, image_pe: torch.Tensor, point_embedding: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + image_embedding: image to attend to. Should be shape + B x embedding_dim x h x w for any h and w. + image_pe: the positional encoding to add to the image. 
Must + have the same shape as image_embedding. + point_embedding: the embedding to add to the query points. + Must have shape B x N_points x embedding_dim for any N_points. + + Returns: + torch.Tensor: the processed point_embedding. + torch.Tensor: the processed image_embedding. + """ + # BxCxHxW -> BxHWxC == B x N_image_tokens x C + image_embedding = image_embedding.flatten(2).permute(0, 2, 1) + image_pe = image_pe.flatten(2).permute(0, 2, 1) + + # Prepare queries + queries = point_embedding + keys = image_embedding + + # Apply transformer blocks and final layernorm + for layer in self.layers: + queries, keys = layer(queries=queries, keys=keys, query_pe=point_embedding, key_pe=image_pe) + + # Apply the final attention layer from the points to the image + q = queries + point_embedding + k = keys + image_pe + attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm_final_attn(queries) + + return queries, keys + + +class TwoWayAttentionBlock(nn.Module): + def __init__( + self, + embedding_dim: int, + num_heads: int, + mlp_dim: int = 2048, + activation: tuple | str = "relu", + attention_downsample_rate: int = 2, + skip_first_layer_pe: bool = False, + ) -> None: + """ + A transformer block with four layers: (1) self-attention of sparse + inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp + block on sparse inputs, and (4) cross attention of dense inputs to sparse + inputs. + Adapted from `https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/transformer.py`. + + Args: + embedding_dim: the channel dimension of the embeddings. + num_heads: the number of heads in the attention layers. + mlp_dim: the hidden dimension of the mlp block. + activation: the activation of the mlp block. + skip_first_layer_pe: skip the PE on the first layer. 
+ """ + super().__init__() + self.self_attn = Attention(embedding_dim, num_heads) + self.norm1 = nn.LayerNorm(embedding_dim) + + self.cross_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + self.norm2 = nn.LayerNorm(embedding_dim) + + self.mlp = MLPBlock(hidden_size=embedding_dim, mlp_dim=mlp_dim, act=activation, dropout_mode="vista3d") + self.norm3 = nn.LayerNorm(embedding_dim) + + self.norm4 = nn.LayerNorm(embedding_dim) + self.cross_attn_image_to_token = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + + self.skip_first_layer_pe = skip_first_layer_pe + + def forward( + self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self attention block + if self.skip_first_layer_pe: + queries = self.self_attn(q=queries, k=queries, v=queries) + else: + q = queries + query_pe + attn_out = self.self_attn(q=q, k=q, v=queries) + queries = queries + attn_out + queries = self.norm1(queries) + + # Cross attention block, tokens attending to image embedding + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm2(queries) + + # MLP block + mlp_out = self.mlp(queries) + queries = queries + mlp_out + queries = self.norm3(queries) + + # Cross attention block, image embedding attending to tokens + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries) + keys = keys + attn_out + keys = self.norm4(keys) + + return queries, keys + + +class Attention(nn.Module): + """ + An attention layer that allows for downscaling the size of the embedding + after projection to queries, keys, and values. + Adapted from `https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/transformer.py`. 
+ + Args: + embedding_dim: the channel dimension of the embeddings. + num_heads: the number of heads in the attention layers. + downsample_rate: the rate at which to downsample the image before projecting. + """ + + def __init__(self, embedding_dim: int, num_heads: int, downsample_rate: int = 1) -> None: + super().__init__() + self.embedding_dim = embedding_dim + self.internal_dim = embedding_dim // downsample_rate + self.num_heads = num_heads + if not self.internal_dim % num_heads == 0: + raise ValueError("num_heads must divide embedding_dim.") + + self.q_proj = nn.Linear(embedding_dim, self.internal_dim) + self.k_proj = nn.Linear(embedding_dim, self.internal_dim) + self.v_proj = nn.Linear(embedding_dim, self.internal_dim) + self.out_proj = nn.Linear(self.internal_dim, embedding_dim) + + def _separate_heads(self, x: torch.Tensor, num_heads: int) -> torch.Tensor: + b, n, c = x.shape + x = x.reshape(b, n, num_heads, c // num_heads) + # B x N_heads x N_tokens x C_per_head + return x.transpose(1, 2) + + def _recombine_heads(self, x: torch.Tensor) -> torch.Tensor: + b, n_heads, n_tokens, c_per_head = x.shape + x = x.transpose(1, 2) + # B x N_tokens x C + return x.reshape(b, n_tokens, n_heads * c_per_head) + + def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor: + # Input projections + q = self.q_proj(q) + k = self.k_proj(k) + v = self.v_proj(v) + + # Separate into heads + q = self._separate_heads(q, self.num_heads) + k = self._separate_heads(k, self.num_heads) + v = self._separate_heads(v, self.num_heads) + + # Attention + _, _, _, c_per_head = q.shape + attn = q @ k.permute(0, 1, 3, 2) # B x N_heads x N_tokens x N_tokens + attn = attn / math.sqrt(c_per_head) + attn = torch.softmax(attn, dim=-1) + + # Get output + out = attn @ v + out = self._recombine_heads(out) + out = self.out_proj(out) + + return out + + +class PositionEmbeddingRandom(nn.Module): + """ + Positional encoding using random spatial frequencies. 
+ Adapted from `https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py`. + + Args: + num_pos_feats: the number of positional encoding features. + scale: the scale of the positional encoding. + """ + + def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None: + super().__init__() + if scale is None or scale <= 0.0: + scale = 1.0 + self.register_buffer("positional_encoding_gaussian_matrix", scale * torch.randn((3, num_pos_feats))) + + def _pe_encoding(self, coords: torch.torch.Tensor) -> torch.torch.Tensor: + """Positionally encode points that are normalized to [0,1].""" + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coords = 2 * coords - 1 + # [bs=1,N=2,2] @ [2,128] + # [bs=1, N=2, 128] + coords = coords @ self.positional_encoding_gaussian_matrix + coords = 2 * np.pi * coords + # outputs d_1 x ... x d_n x C shape + # [bs=1, N=2, 128+128=256] + return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1) + + def forward(self, size: Tuple[int, int, int]) -> torch.torch.Tensor: + """Generate positional encoding for a grid of the specified size.""" + h, w, d = size + device: Any = self.positional_encoding_gaussian_matrix.device + grid = torch.ones((h, w, d), device=device, dtype=torch.float32) + x_embed = grid.cumsum(dim=0) - 0.5 + y_embed = grid.cumsum(dim=1) - 0.5 + z_embed = grid.cumsum(dim=2) - 0.5 + x_embed = x_embed / h + y_embed = y_embed / w + z_embed = z_embed / d + pe = self._pe_encoding(torch.stack([x_embed, y_embed, z_embed], dim=-1)) + # C x H x W + return pe.permute(3, 0, 1, 2) + + def forward_with_coords( + self, coords_input: torch.torch.Tensor, image_size: Tuple[int, int, int] + ) -> torch.torch.Tensor: + """Positionally encode points that are not normalized to [0,1].""" + coords = coords_input.clone() + coords[:, :, 0] = coords[:, :, 0] / image_size[0] + coords[:, :, 1] = coords[:, :, 1] / image_size[1] + coords[:, :, 2] = coords[:, :, 2] 
/ image_size[2] + # B x N x C + return self._pe_encoding(coords.to(torch.float)) + + +class MLP(nn.Module): + """ + Multi-layer perceptron. This class is only used for `PointMappingSAM`. + Adapted from `https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/mask_decoder.py`. + + Args: + input_dim: the input dimension. + hidden_dim: the hidden dimension. + output_dim: the output dimension. + num_layers: the number of layers. + sigmoid_output: whether to apply a sigmoid activation to the output. + """ + + def __init__( + self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, sigmoid_output: bool = False + ) -> None: + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.sigmoid_output = sigmoid_output + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + if self.sigmoid_output: + x = F.sigmoid(x) + return x diff --git a/monai/networks/nets/vqvae.py b/monai/networks/nets/vqvae.py new file mode 100644 index 0000000000..f198bfbb2b --- /dev/null +++ b/monai/networks/nets/vqvae.py @@ -0,0 +1,472 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from collections.abc import Sequence +from typing import Tuple + +import torch +import torch.nn as nn + +from monai.networks.blocks import Convolution +from monai.networks.layers import Act +from monai.networks.layers.vector_quantizer import EMAQuantizer, VectorQuantizer +from monai.utils import ensure_tuple_rep + +__all__ = ["VQVAE"] + + +class VQVAEResidualUnit(nn.Module): + """ + Implementation of the ResidualLayer used in the VQVAE network as originally used in Morphology-preserving + Autoregressive 3D Generative Modelling of the Brain by Tudosiu et al. (https://arxiv.org/pdf/2209.03177.pdf). + + The original implementation that can be found at + https://github.com/AmigoLab/SynthAnatomy/blob/main/src/networks/vqvae/baseline.py#L150. + + Args: + spatial_dims: number of spatial spatial_dims of the input data. + in_channels: number of input channels. + num_res_channels: number of channels in the residual layers. + act: activation type and arguments. Defaults to RELU. + dropout: dropout ratio. Defaults to no dropout. + bias: whether to have a bias term. Defaults to True. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + num_res_channels: int, + act: tuple | str | None = Act.RELU, + dropout: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + + self.spatial_dims = spatial_dims + self.in_channels = in_channels + self.num_res_channels = num_res_channels + self.act = act + self.dropout = dropout + self.bias = bias + + self.conv1 = Convolution( + spatial_dims=self.spatial_dims, + in_channels=self.in_channels, + out_channels=self.num_res_channels, + adn_ordering="DA", + act=self.act, + dropout=self.dropout, + bias=self.bias, + ) + + self.conv2 = Convolution( + spatial_dims=self.spatial_dims, + in_channels=self.num_res_channels, + out_channels=self.in_channels, + bias=self.bias, + conv_only=True, + ) + + def forward(self, x): + return torch.nn.functional.relu(x + self.conv2(self.conv1(x)), True) + + +class Encoder(nn.Module): + """ + Encoder module for VQ-VAE. + + Args: + spatial_dims: number of spatial spatial_dims. + in_channels: number of input channels. + out_channels: number of channels in the latent space (embedding_dim). + channels: sequence containing the number of channels at each level of the encoder. + num_res_layers: number of sequential residual layers at each level. + num_res_channels: number of channels in the residual layers at each level. + downsample_parameters: A Tuple of Tuples for defining the downsampling convolutions. Each Tuple should hold the + following information stride (int), kernel_size (int), dilation (int) and padding (int). + dropout: dropout ratio. + act: activation type and arguments. 
+ """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + channels: Sequence[int], + num_res_layers: int, + num_res_channels: Sequence[int], + downsample_parameters: Sequence[Tuple[int, int, int, int]], + dropout: float, + act: tuple | str | None, + ) -> None: + super().__init__() + self.spatial_dims = spatial_dims + self.in_channels = in_channels + self.out_channels = out_channels + self.channels = channels + self.num_res_layers = num_res_layers + self.num_res_channels = num_res_channels + self.downsample_parameters = downsample_parameters + self.dropout = dropout + self.act = act + + blocks: list[nn.Module] = [] + + for i in range(len(self.channels)): + blocks.append( + Convolution( + spatial_dims=self.spatial_dims, + in_channels=self.in_channels if i == 0 else self.channels[i - 1], + out_channels=self.channels[i], + strides=self.downsample_parameters[i][0], + kernel_size=self.downsample_parameters[i][1], + adn_ordering="DA", + act=self.act, + dropout=None if i == 0 else self.dropout, + dropout_dim=1, + dilation=self.downsample_parameters[i][2], + padding=self.downsample_parameters[i][3], + ) + ) + + for _ in range(self.num_res_layers): + blocks.append( + VQVAEResidualUnit( + spatial_dims=self.spatial_dims, + in_channels=self.channels[i], + num_res_channels=self.num_res_channels[i], + act=self.act, + dropout=self.dropout, + ) + ) + + blocks.append( + Convolution( + spatial_dims=self.spatial_dims, + in_channels=self.channels[len(self.channels) - 1], + out_channels=self.out_channels, + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + + self.blocks = nn.ModuleList(blocks) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for block in self.blocks: + x = block(x) + return x + + +class Decoder(nn.Module): + """ + Decoder module for VQ-VAE. + + Args: + spatial_dims: number of spatial spatial_dims. + in_channels: number of channels in the latent space (embedding_dim). 
+ out_channels: number of output channels. + channels: sequence containing the number of channels at each level of the decoder. + num_res_layers: number of sequential residual layers at each level. + num_res_channels: number of channels in the residual layers at each level. + upsample_parameters: A Tuple of Tuples for defining the upsampling convolutions. Each Tuple should hold the + following information stride (int), kernel_size (int), dilation (int), padding (int), output_padding (int). + dropout: dropout ratio. + act: activation type and arguments. + output_act: activation type and arguments for the output. + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + channels: Sequence[int], + num_res_layers: int, + num_res_channels: Sequence[int], + upsample_parameters: Sequence[Tuple[int, int, int, int, int]], + dropout: float, + act: tuple | str | None, + output_act: tuple | str | None, + ) -> None: + super().__init__() + self.spatial_dims = spatial_dims + self.in_channels = in_channels + self.out_channels = out_channels + self.channels = channels + self.num_res_layers = num_res_layers + self.num_res_channels = num_res_channels + self.upsample_parameters = upsample_parameters + self.dropout = dropout + self.act = act + self.output_act = output_act + + reversed_num_channels = list(reversed(self.channels)) + + blocks: list[nn.Module] = [] + blocks.append( + Convolution( + spatial_dims=self.spatial_dims, + in_channels=self.in_channels, + out_channels=reversed_num_channels[0], + strides=1, + kernel_size=3, + padding=1, + conv_only=True, + ) + ) + + reversed_num_res_channels = list(reversed(self.num_res_channels)) + for i in range(len(self.channels)): + for _ in range(self.num_res_layers): + blocks.append( + VQVAEResidualUnit( + spatial_dims=self.spatial_dims, + in_channels=reversed_num_channels[i], + num_res_channels=reversed_num_res_channels[i], + act=self.act, + dropout=self.dropout, + ) + ) + + blocks.append( + Convolution( 
+ spatial_dims=self.spatial_dims, + in_channels=reversed_num_channels[i], + out_channels=self.out_channels if i == len(self.channels) - 1 else reversed_num_channels[i + 1], + strides=self.upsample_parameters[i][0], + kernel_size=self.upsample_parameters[i][1], + adn_ordering="DA", + act=self.act, + dropout=self.dropout if i != len(self.channels) - 1 else None, + norm=None, + dilation=self.upsample_parameters[i][2], + conv_only=i == len(self.channels) - 1, + is_transposed=True, + padding=self.upsample_parameters[i][3], + output_padding=self.upsample_parameters[i][4], + ) + ) + + if self.output_act: + blocks.append(Act[self.output_act]()) + + self.blocks = nn.ModuleList(blocks) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for block in self.blocks: + x = block(x) + return x + + +class VQVAE(nn.Module): + """ + Vector-Quantised Variational Autoencoder (VQ-VAE) used in Morphology-preserving Autoregressive 3D Generative + Modelling of the Brain by Tudosiu et al. (https://arxiv.org/pdf/2209.03177.pdf) + + The original implementation can be found at + https://github.com/AmigoLab/SynthAnatomy/blob/main/src/networks/vqvae/baseline.py#L163/ + + Args: + spatial_dims: number of spatial spatial_dims. + in_channels: number of input channels. + out_channels: number of output channels. + downsample_parameters: A Tuple of Tuples for defining the downsampling convolutions. Each Tuple should hold the + following information stride (int), kernel_size (int), dilation (int) and padding (int). + upsample_parameters: A Tuple of Tuples for defining the upsampling convolutions. Each Tuple should hold the + following information stride (int), kernel_size (int), dilation (int), padding (int), output_padding (int). + num_res_layers: number of sequential residual layers at each level. + channels: number of channels at each level. + num_res_channels: number of channels in the residual layers at each level. 
+ num_embeddings: VectorQuantization number of atomic elements in the codebook. + embedding_dim: VectorQuantization number of channels of the input and atomic elements. + commitment_cost: VectorQuantization commitment_cost. + decay: VectorQuantization decay. + epsilon: VectorQuantization epsilon. + act: activation type and arguments. + dropout: dropout ratio. + output_act: activation type and arguments for the output. + ddp_sync: whether to synchronize the codebook across processes. + use_checkpointing if True, use activation checkpointing to save memory. + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + channels: Sequence[int] = (96, 96, 192), + num_res_layers: int = 3, + num_res_channels: Sequence[int] | int = (96, 96, 192), + downsample_parameters: Sequence[Tuple[int, int, int, int]] | Tuple[int, int, int, int] = ( + (2, 4, 1, 1), + (2, 4, 1, 1), + (2, 4, 1, 1), + ), + upsample_parameters: Sequence[Tuple[int, int, int, int, int]] | Tuple[int, int, int, int, int] = ( + (2, 4, 1, 1, 0), + (2, 4, 1, 1, 0), + (2, 4, 1, 1, 0), + ), + num_embeddings: int = 32, + embedding_dim: int = 64, + embedding_init: str = "normal", + commitment_cost: float = 0.25, + decay: float = 0.5, + epsilon: float = 1e-5, + dropout: float = 0.0, + act: tuple | str | None = Act.RELU, + output_act: tuple | str | None = None, + ddp_sync: bool = True, + use_checkpointing: bool = False, + ): + super().__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.spatial_dims = spatial_dims + self.channels = channels + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.use_checkpointing = use_checkpointing + + if isinstance(num_res_channels, int): + num_res_channels = ensure_tuple_rep(num_res_channels, len(channels)) + + if len(num_res_channels) != len(channels): + raise ValueError( + "`num_res_channels` should be a single integer or a tuple of integers with the same length as " + "`num_channls`." 
+ ) + if all(isinstance(values, int) for values in upsample_parameters): + upsample_parameters_tuple: Sequence = (upsample_parameters,) * len(channels) + else: + upsample_parameters_tuple = upsample_parameters + + if all(isinstance(values, int) for values in downsample_parameters): + downsample_parameters_tuple: Sequence = (downsample_parameters,) * len(channels) + else: + downsample_parameters_tuple = downsample_parameters + + if not all(all(isinstance(value, int) for value in sub_item) for sub_item in downsample_parameters_tuple): + raise ValueError("`downsample_parameters` should be a single tuple of integer or a tuple of tuples.") + + # check if downsample_parameters is a tuple of ints or a tuple of tuples of ints + if not all(all(isinstance(value, int) for value in sub_item) for sub_item in upsample_parameters_tuple): + raise ValueError("`upsample_parameters` should be a single tuple of integer or a tuple of tuples.") + + for parameter in downsample_parameters_tuple: + if len(parameter) != 4: + raise ValueError("`downsample_parameters` should be a tuple of tuples with 4 integers.") + + for parameter in upsample_parameters_tuple: + if len(parameter) != 5: + raise ValueError("`upsample_parameters` should be a tuple of tuples with 5 integers.") + + if len(downsample_parameters_tuple) != len(channels): + raise ValueError( + "`downsample_parameters` should be a tuple of tuples with the same length as `num_channels`." + ) + + if len(upsample_parameters_tuple) != len(channels): + raise ValueError( + "`upsample_parameters` should be a tuple of tuples with the same length as `num_channels`." 
+ ) + + self.num_res_layers = num_res_layers + self.num_res_channels = num_res_channels + + self.encoder = Encoder( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=embedding_dim, + channels=channels, + num_res_layers=num_res_layers, + num_res_channels=num_res_channels, + downsample_parameters=downsample_parameters_tuple, + dropout=dropout, + act=act, + ) + + self.decoder = Decoder( + spatial_dims=spatial_dims, + in_channels=embedding_dim, + out_channels=out_channels, + channels=channels, + num_res_layers=num_res_layers, + num_res_channels=num_res_channels, + upsample_parameters=upsample_parameters_tuple, + dropout=dropout, + act=act, + output_act=output_act, + ) + + self.quantizer = VectorQuantizer( + quantizer=EMAQuantizer( + spatial_dims=spatial_dims, + num_embeddings=num_embeddings, + embedding_dim=embedding_dim, + commitment_cost=commitment_cost, + decay=decay, + epsilon=epsilon, + embedding_init=embedding_init, + ddp_sync=ddp_sync, + ) + ) + + def encode(self, images: torch.Tensor) -> torch.Tensor: + output: torch.Tensor + if self.use_checkpointing: + output = torch.utils.checkpoint.checkpoint(self.encoder, images, use_reentrant=False) + else: + output = self.encoder(images) + return output + + def quantize(self, encodings: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + x_loss, x = self.quantizer(encodings) + return x, x_loss + + def decode(self, quantizations: torch.Tensor) -> torch.Tensor: + output: torch.Tensor + + if self.use_checkpointing: + output = torch.utils.checkpoint.checkpoint(self.decoder, quantizations, use_reentrant=False) + else: + output = self.decoder(quantizations) + return output + + def index_quantize(self, images: torch.Tensor) -> torch.Tensor: + return self.quantizer.quantize(self.encode(images=images)) + + def decode_samples(self, embedding_indices: torch.Tensor) -> torch.Tensor: + return self.decode(self.quantizer.embed(embedding_indices)) + + def forward(self, images: torch.Tensor) -> tuple[torch.Tensor, 
torch.Tensor]: + quantizations, quantization_losses = self.quantize(self.encode(images)) + reconstruction = self.decode(quantizations) + + return reconstruction, quantization_losses + + def encode_stage_2_inputs(self, x: torch.Tensor) -> torch.Tensor: + z = self.encode(x) + e, _ = self.quantize(z) + return e + + def decode_stage_2_outputs(self, z: torch.Tensor) -> torch.Tensor: + e, _ = self.quantize(z) + image = self.decode(e) + return image diff --git a/monai/networks/schedulers/__init__.py b/monai/networks/schedulers/__init__.py new file mode 100644 index 0000000000..29e9020d65 --- /dev/null +++ b/monai/networks/schedulers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from .ddim import DDIMScheduler +from .ddpm import DDPMScheduler +from .pndm import PNDMScheduler +from .scheduler import NoiseSchedules, Scheduler diff --git a/monai/networks/schedulers/ddim.py b/monai/networks/schedulers/ddim.py new file mode 100644 index 0000000000..2a0121d063 --- /dev/null +++ b/monai/networks/schedulers/ddim.py @@ -0,0 +1,294 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ========================================================================= +# Adapted from https://github.com/huggingface/diffusers +# which has the following license: +# https://github.com/huggingface/diffusers/blob/main/LICENSE +# +# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= + +from __future__ import annotations + +import numpy as np +import torch + +from .ddpm import DDPMPredictionType +from .scheduler import Scheduler + +DDIMPredictionType = DDPMPredictionType + + +class DDIMScheduler(Scheduler): + """ + Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising + diffusion probabilistic models (DDPMs) with non-Markovian guidance. Based on: Song et al. "Denoising Diffusion + Implicit Models" https://arxiv.org/abs/2010.02502 + + Args: + num_train_timesteps: number of diffusion steps used to train the model. 
+ schedule: member of NoiseSchedules, name of noise schedule function in component store + clip_sample: option to clip predicted sample between -1 and 1 for numerical stability. + set_alpha_to_one: each diffusion step uses the value of alphas product at that step and at the previous one. + For the final step there is no previous alpha. When this option is `True` the previous alpha product is + fixed to `1`, otherwise it uses the value of alpha at step 0. + steps_offset: an offset added to the inference steps. You can use a combination of `steps_offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. + prediction_type: member of DDPMPredictionType + clip_sample_min: minimum clipping value when clip_sample equals True + clip_sample_max: maximum clipping value when clip_sample equals True + schedule_args: arguments to pass to the schedule function + + """ + + def __init__( + self, + num_train_timesteps: int = 1000, + schedule: str = "linear_beta", + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = DDIMPredictionType.EPSILON, + clip_sample_min: float = -1.0, + clip_sample_max: float = 1.0, + **schedule_args, + ) -> None: + super().__init__(num_train_timesteps, schedule, **schedule_args) + + if prediction_type not in DDIMPredictionType.__members__.values(): + raise ValueError("Argument `prediction_type` must be a member of DDIMPredictionType") + + self.prediction_type = prediction_type + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. 
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + self.timesteps = torch.from_numpy(np.arange(0, self.num_train_timesteps)[::-1].astype(np.int64)) + + self.clip_sample = clip_sample + self.clip_sample_values = [clip_sample_min, clip_sample_max] + self.steps_offset = steps_offset + + # default the number of inference timesteps to the number of train steps + self.num_inference_steps: int + self.set_timesteps(self.num_train_timesteps) + + def set_timesteps(self, num_inference_steps: int, device: str | torch.device | None = None) -> None: + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps: number of diffusion steps used when generating samples with a pre-trained model. + device: target device to put the data. + """ + if num_inference_steps > self.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.num_train_timesteps`:" + f" {self.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + step_ratio = self.num_train_timesteps // self.num_inference_steps + if self.steps_offset >= step_ratio: + raise ValueError( + f"`steps_offset`: {self.steps_offset} cannot be greater than or equal to " + f"`num_train_timesteps // num_inference_steps : {step_ratio}` as this will cause timesteps to exceed" + f" the max train timestep." 
+ ) + + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + self.timesteps = torch.from_numpy(timesteps).to(device) + self.timesteps += self.steps_offset + + def _get_variance(self, timestep: int, prev_timestep: int) -> torch.Tensor: + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance: torch.Tensor = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def step( + self, + model_output: torch.Tensor, + timestep: int, + sample: torch.Tensor, + eta: float = 0.0, + generator: torch.Generator | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output: direct output from learned diffusion model. + timestep: current discrete timestep in the diffusion chain. + sample: current instance of sample being created by diffusion process. + eta: weight of noise for added noise in diffusion step. + generator: random number generator. + + Returns: + pred_prev_sample: Predicted previous sample + pred_original_sample: Predicted original sample + """ + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - model_output -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. 
get previous step value (=t-1) + prev_timestep = timestep - self.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + # predefinitions satisfy pylint/mypy, these values won't be ultimately used + pred_original_sample = sample + pred_epsilon = model_output + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + if self.prediction_type == DDIMPredictionType.EPSILON: + pred_original_sample = (sample - (beta_prod_t**0.5) * model_output) / (alpha_prod_t**0.5) + pred_epsilon = model_output + elif self.prediction_type == DDIMPredictionType.SAMPLE: + pred_original_sample = model_output + pred_epsilon = (sample - (alpha_prod_t**0.5) * pred_original_sample) / (beta_prod_t**0.5) + elif self.prediction_type == DDIMPredictionType.V_PREDICTION: + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + + # 4. Clip "predicted x_0" + if self.clip_sample: + pred_original_sample = torch.clamp( + pred_original_sample, self.clip_sample_values[0], self.clip_sample_values[1] + ) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = self._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance**0.5 + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * pred_epsilon + + # 7. 
compute x_t-1 without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_prev_sample = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + + if eta > 0: + # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072 + device: torch.device = torch.device(model_output.device if torch.is_tensor(model_output) else "cpu") + noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device) + variance = self._get_variance(timestep, prev_timestep) ** 0.5 * eta * noise + + pred_prev_sample = pred_prev_sample + variance + + return pred_prev_sample, pred_original_sample + + def reversed_step( + self, model_output: torch.Tensor, timestep: int, sample: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Predict the sample at the next timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output: direct output from learned diffusion model. + timestep: current discrete timestep in the diffusion chain. + sample: current instance of sample being created by diffusion process. + + Returns: + pred_prev_sample: Predicted previous sample + pred_original_sample: Predicted original sample + """ + # See Appendix F at https://arxiv.org/pdf/2105.05233.pdf, or Equation (6) in https://arxiv.org/pdf/2203.04306.pdf + + # Notation ( -> + # - model_output -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_post_sample -> "x_t+1" + + # 1. get previous step value (=t+1) + prev_timestep = timestep + self.num_train_timesteps // self.num_inference_steps + + # 2. 
compute alphas, betas at timestep t+1 + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + # predefinitions satisfy pylint/mypy, these values won't be ultimately used + pred_original_sample = sample + pred_epsilon = model_output + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + + if self.prediction_type == DDIMPredictionType.EPSILON: + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + pred_epsilon = model_output + elif self.prediction_type == DDIMPredictionType.SAMPLE: + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.prediction_type == DDIMPredictionType.V_PREDICTION: + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + + # 4. Clip "predicted x_0" + if self.clip_sample: + pred_original_sample = torch.clamp( + pred_original_sample, self.clip_sample_values[0], self.clip_sample_values[1] + ) + + # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon + + # 6. 
compute x_t+1 without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_post_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + + return pred_post_sample, pred_original_sample diff --git a/monai/networks/schedulers/ddpm.py b/monai/networks/schedulers/ddpm.py new file mode 100644 index 0000000000..93ad833031 --- /dev/null +++ b/monai/networks/schedulers/ddpm.py @@ -0,0 +1,250 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ========================================================================= +# Adapted from https://github.com/huggingface/diffusers +# which has the following license: +# https://github.com/huggingface/diffusers/blob/main/LICENSE +# +# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ========================================================================= + +from __future__ import annotations + +import numpy as np +import torch + +from monai.utils import StrEnum + +from .scheduler import Scheduler + + +class DDPMVarianceType(StrEnum): + """ + Valid names for DDPM Scheduler's `variance_type` argument. Options to clip the variance used when adding noise + to the denoised sample. + """ + + FIXED_SMALL = "fixed_small" + FIXED_LARGE = "fixed_large" + LEARNED = "learned" + LEARNED_RANGE = "learned_range" + + +class DDPMPredictionType(StrEnum): + """ + Set of valid prediction type names for the DDPM scheduler's `prediction_type` argument. + + epsilon: predicting the noise of the diffusion process + sample: directly predicting the noisy sample + v_prediction: velocity prediction, see section 2.4 https://imagen.research.google/video/paper.pdf + """ + + EPSILON = "epsilon" + SAMPLE = "sample" + V_PREDICTION = "v_prediction" + + +class DDPMScheduler(Scheduler): + """ + Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and + Langevin dynamics sampling. Based on: Ho et al., "Denoising Diffusion Probabilistic Models" + https://arxiv.org/abs/2006.11239 + + Args: + num_train_timesteps: number of diffusion steps used to train the model. + schedule: member of NoiseSchedules, name of noise schedule function in component store + variance_type: member of DDPMVarianceType + clip_sample: option to clip predicted sample between -1 and 1 for numerical stability. 
+ prediction_type: member of DDPMPredictionType + clip_sample_min: minimum clipping value when clip_sample equals True + clip_sample_max: maximum clipping value when clip_sample equals True + schedule_args: arguments to pass to the schedule function + """ + + def __init__( + self, + num_train_timesteps: int = 1000, + schedule: str = "linear_beta", + variance_type: str = DDPMVarianceType.FIXED_SMALL, + clip_sample: bool = True, + prediction_type: str = DDPMPredictionType.EPSILON, + clip_sample_min: float = -1.0, + clip_sample_max: float = 1.0, + **schedule_args, + ) -> None: + super().__init__(num_train_timesteps, schedule, **schedule_args) + + if variance_type not in DDPMVarianceType.__members__.values(): + raise ValueError("Argument `variance_type` must be a member of `DDPMVarianceType`") + + if prediction_type not in DDPMPredictionType.__members__.values(): + raise ValueError("Argument `prediction_type` must be a member of `DDPMPredictionType`") + + self.clip_sample = clip_sample + self.clip_sample_values = [clip_sample_min, clip_sample_max] + self.variance_type = variance_type + self.prediction_type = prediction_type + + def set_timesteps(self, num_inference_steps: int, device: str | torch.device | None = None) -> None: + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps: number of diffusion steps used when generating samples with a pre-trained model. + device: target device to put the data. + """ + if num_inference_steps > self.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.num_train_timesteps`:" + f" {self.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.num_train_timesteps} timesteps." 
+ ) + + self.num_inference_steps = num_inference_steps + step_ratio = self.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].astype(np.int64) + self.timesteps = torch.from_numpy(timesteps).to(device) + + def _get_mean(self, timestep: int, x_0: torch.Tensor, x_t: torch.Tensor) -> torch.Tensor: + """ + Compute the mean of the posterior at timestep t. + + Args: + timestep: current timestep. + x0: the noise-free input. + x_t: the input noised to timestep t. + + Returns: + Returns the mean + """ + # these attributes are used for calculating the posterior, q(x_{t-1}|x_t,x_0), + # (see formula (5-7) from https://arxiv.org/pdf/2006.11239.pdf) + alpha_t = self.alphas[timestep] + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one + + x_0_coefficient = alpha_prod_t_prev.sqrt() * self.betas[timestep] / (1 - alpha_prod_t) + x_t_coefficient = alpha_t.sqrt() * (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) + + mean: torch.Tensor = x_0_coefficient * x_0 + x_t_coefficient * x_t + + return mean + + def _get_variance(self, timestep: int, predicted_variance: torch.Tensor | None = None) -> torch.Tensor: + """ + Compute the variance of the posterior at timestep t. + + Args: + timestep: current timestep. + predicted_variance: variance predicted by the model. 
+ + Returns: + Returns the variance + """ + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one + + # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) + # and sample from it to get previous sample + # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample + variance: torch.Tensor = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[timestep] + # hacks - were probably added for training stability + if self.variance_type == DDPMVarianceType.FIXED_SMALL: + variance = torch.clamp(variance, min=1e-20) + elif self.variance_type == DDPMVarianceType.FIXED_LARGE: + variance = self.betas[timestep] + elif self.variance_type == DDPMVarianceType.LEARNED and predicted_variance is not None: + return predicted_variance + elif self.variance_type == DDPMVarianceType.LEARNED_RANGE and predicted_variance is not None: + min_log = variance + max_log = self.betas[timestep] + frac = (predicted_variance + 1) / 2 + variance = frac * max_log + (1 - frac) * min_log + + return variance + + def step( + self, model_output: torch.Tensor, timestep: int, sample: torch.Tensor, generator: torch.Generator | None = None + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output: direct output from learned diffusion model. + timestep: current discrete timestep in the diffusion chain. + sample: current instance of sample being created by diffusion process. + generator: random number generator. 
+ + Returns: + pred_prev_sample: Predicted previous sample + """ + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) + else: + predicted_variance = None + + # 1. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + # 2. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if self.prediction_type == DDPMPredictionType.EPSILON: + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.prediction_type == DDPMPredictionType.SAMPLE: + pred_original_sample = model_output + elif self.prediction_type == DDPMPredictionType.V_PREDICTION: + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + + # 3. Clip "predicted x_0" + if self.clip_sample: + pred_original_sample = torch.clamp( + pred_original_sample, self.clip_sample_values[0], self.clip_sample_values[1] + ) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[timestep]) / beta_prod_t + current_sample_coeff = self.alphas[timestep] ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample + + # 6. 
Add noise + variance = 0 + if timestep > 0: + noise = torch.randn( + model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator + ).to(model_output.device) + variance = (self._get_variance(timestep, predicted_variance=predicted_variance) ** 0.5) * noise + + pred_prev_sample = pred_prev_sample + variance + + return pred_prev_sample, pred_original_sample diff --git a/monai/networks/schedulers/pndm.py b/monai/networks/schedulers/pndm.py new file mode 100644 index 0000000000..c0728bbdff --- /dev/null +++ b/monai/networks/schedulers/pndm.py @@ -0,0 +1,316 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ========================================================================= +# Adapted from https://github.com/huggingface/diffusers +# which has the following license: +# https://github.com/huggingface/diffusers/blob/main/LICENSE +# +# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class PNDMPredictionType(StrEnum):
    """
    Set of valid prediction type names for the PNDM scheduler's `prediction_type` argument.

    epsilon: predicting the noise of the diffusion process
    v_prediction: velocity prediction, see section 2.4 https://imagen.research.google/video/paper.pdf
    """

    EPSILON = "epsilon"
    V_PREDICTION = "v_prediction"


class PNDMScheduler(Scheduler):
    """
    Pseudo numerical methods for diffusion models (PNDM) proposes using more advanced ODE integration techniques,
    namely Runge-Kutta method and a linear multi-step method. Based on: Liu et al.,
    "Pseudo Numerical Methods for Diffusion Models on Manifolds" https://arxiv.org/abs/2202.09778

    Args:
        num_train_timesteps: number of diffusion steps used to train the model.
        schedule: member of NoiseSchedules, name of noise schedule function in component store
        skip_prk_steps:
            allows the scheduler to skip the Runge-Kutta steps that are defined in the original paper as being required
            before plms step.
        set_alpha_to_one:
            each diffusion step uses the value of alphas product at that step and at the previous one. For the final
            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the value of alpha at step 0.
        prediction_type: member of PNDMPredictionType
        steps_offset:
            an offset added to the inference steps. You can use a combination of `offset=1` and
            `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
            stable diffusion.
        schedule_args: arguments to pass to the schedule function

    Raises:
        ValueError: when `prediction_type` is not a member of PNDMPredictionType.
    """

    def __init__(
        self,
        num_train_timesteps: int = 1000,
        schedule: str = "linear_beta",
        skip_prk_steps: bool = False,
        set_alpha_to_one: bool = False,
        prediction_type: str = PNDMPredictionType.EPSILON,
        steps_offset: int = 0,
        **schedule_args,
    ) -> None:
        super().__init__(num_train_timesteps, schedule, **schedule_args)

        if prediction_type not in PNDMPredictionType.__members__.values():
            raise ValueError("Argument `prediction_type` must be a member of PNDMPredictionType")

        self.prediction_type = prediction_type

        self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]

        # standard deviation of the initial noise distribution
        self.init_noise_sigma = 1.0

        # For now we only support F-PNDM, i.e. the runge-kutta method
        # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
        # mainly at formula (9), (12), (13) and the Algorithm 2.
        self.pndm_order = 4

        self.skip_prk_steps = skip_prk_steps
        self.steps_offset = steps_offset

        # running values
        self.cur_model_output = torch.Tensor()
        self.counter = 0
        self.cur_sample = torch.Tensor()
        self.ets: list = []

        # default the number of inference timesteps to the number of train steps
        self.set_timesteps(num_train_timesteps)

    def set_timesteps(self, num_inference_steps: int, device: str | torch.device | None = None) -> None:
        """
        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
        Calling this also clears the stored model outputs (`ets`) and resets the step `counter`.

        Args:
            num_inference_steps: number of diffusion steps used when generating samples with a pre-trained model.
            device: target device to put the data.

        Raises:
            ValueError: when `num_inference_steps` is smaller than 1 or larger than `num_train_timesteps`.
        """
        # guard against a division by zero (or an empty/negative range) in the step ratio below
        if num_inference_steps < 1:
            raise ValueError(f"`num_inference_steps` must be a positive integer, got {num_inference_steps}.")

        if num_inference_steps > self.num_train_timesteps:
            raise ValueError(
                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.num_train_timesteps`:"
                f" {self.num_train_timesteps} as the unet model trained with this scheduler can only handle"
                f" maximal {self.num_train_timesteps} timesteps."
            )

        self.num_inference_steps = num_inference_steps
        step_ratio = self.num_train_timesteps // self.num_inference_steps
        # creates integer timesteps by multiplying by ratio
        # casting to int to avoid issues when num_inference_step is power of 3
        self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().astype(np.int64)
        self._timesteps += self.steps_offset

        if self.skip_prk_steps:
            # for some models like stable diffusion the prk steps can/should be skipped to
            # produce better results. When using PNDM with `self.skip_prk_steps` the implementation
            # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51
            self.prk_timesteps = np.array([])
            self.plms_timesteps = self._timesteps[::-1]

        else:
            prk_timesteps = np.array(self._timesteps[-self.pndm_order :]).repeat(2) + np.tile(
                np.array([0, self.num_train_timesteps // num_inference_steps // 2]), self.pndm_order
            )
            self.prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy()
            # we copy to avoid having negative strides which are not supported by torch.from_numpy
            self.plms_timesteps = self._timesteps[:-3][::-1].copy()

        timesteps = np.concatenate([self.prk_timesteps, self.plms_timesteps]).astype(np.int64)
        self.timesteps = torch.from_numpy(timesteps).to(device)
        # update num_inference_steps - necessary if we use prk steps
        self.num_inference_steps = len(self.timesteps)

        self.ets = []
        self.counter = 0

    def step(self, model_output: torch.Tensor, timestep: int, sample: torch.Tensor) -> tuple[torch.Tensor, Any]:
        """
        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
        process from the learned model outputs (most often the predicted noise).
        This function calls `step_prk()` or `step_plms()` depending on the internal variable `counter`.

        Args:
            model_output: direct output from learned diffusion model.
            timestep: current discrete timestep in the diffusion chain.
            sample: current instance of sample being created by diffusion process.

        Returns:
            a tuple `(pred_prev_sample, None)`: the predicted previous sample, followed by `None` in the slot where
            other samplers return their prediction of the original sample.
        """
        # return a tuple for consistency with samplers that return (previous pred, original sample pred)

        if self.counter < len(self.prk_timesteps) and not self.skip_prk_steps:
            return self.step_prk(model_output=model_output, timestep=timestep, sample=sample), None
        else:
            return self.step_plms(model_output=model_output, timestep=timestep, sample=sample), None

    def step_prk(self, model_output: torch.Tensor, timestep: int, sample: torch.Tensor) -> torch.Tensor:
        """
        Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the
        solution to the differential equation.

        Args:
            model_output: direct output from learned diffusion model.
            timestep: current discrete timestep in the diffusion chain.
            sample: current instance of sample being created by diffusion process.

        Returns:
            pred_prev_sample: Predicted previous sample

        Raises:
            ValueError: when `num_inference_steps` is `None`.
        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        diff_to_prev = 0 if self.counter % 2 else self.num_train_timesteps // self.num_inference_steps // 2
        prev_timestep = timestep - diff_to_prev
        # every group of 4 calls is anchored on the timestep stored at the start of that group
        timestep = self.prk_timesteps[self.counter // 4 * 4]

        if self.counter % 4 == 0:
            self.cur_model_output = 1 / 6 * model_output
            self.ets.append(model_output)
            self.cur_sample = sample
        elif (self.counter - 1) % 4 == 0:
            self.cur_model_output += 1 / 3 * model_output
        elif (self.counter - 2) % 4 == 0:
            self.cur_model_output += 1 / 3 * model_output
        elif (self.counter - 3) % 4 == 0:
            # last call of the group: finish the weighted sum (1/6, 1/3, 1/3, 1/6) and clear the accumulator
            model_output = self.cur_model_output + 1 / 6 * model_output
            self.cur_model_output = torch.Tensor()

        # cur_sample should not be an empty torch.Tensor()
        cur_sample = self.cur_sample if self.cur_sample.numel() != 0 else sample

        prev_sample: torch.Tensor = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output)
        self.counter += 1

        return prev_sample

    def step_plms(self, model_output: torch.Tensor, timestep: int, sample: torch.Tensor) -> torch.Tensor:
        """
        Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple
        times to approximate the solution.

        Args:
            model_output: direct output from learned diffusion model.
            timestep: current discrete timestep in the diffusion chain.
            sample: current instance of sample being created by diffusion process.

        Returns:
            pred_prev_sample: Predicted previous sample

        Raises:
            ValueError: when `num_inference_steps` is `None`, or when prk steps are not skipped and fewer than
                3 model outputs have been stored in `ets`.
        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        if not self.skip_prk_steps and len(self.ets) < 3:
            raise ValueError(
                f"{self.__class__} can only be run AFTER scheduler has been run "
                "in 'prk' mode for at least 12 iterations "
            )

        prev_timestep = timestep - self.num_train_timesteps // self.num_inference_steps

        if self.counter != 1:
            # keep at most the 3 most recent stored outputs before appending the new one
            self.ets = self.ets[-3:]
            self.ets.append(model_output)
        else:
            # counter == 1: nothing is appended and the timestep pair is shifted one interval up
            prev_timestep = timestep
            timestep = timestep + self.num_train_timesteps // self.num_inference_steps

        if len(self.ets) == 1 and self.counter == 0:
            # model_output is used unchanged here; remember the sample for the counter == 1 call
            self.cur_sample = sample
        elif len(self.ets) == 1 and self.counter == 1:
            model_output = (model_output + self.ets[-1]) / 2
            sample = self.cur_sample
            self.cur_sample = torch.Tensor()
        elif len(self.ets) == 2:
            model_output = (3 * self.ets[-1] - self.ets[-2]) / 2
        elif len(self.ets) == 3:
            model_output = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
        else:
            model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4])

        prev_sample: torch.Tensor = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
        self.counter += 1

        return prev_sample

    def _get_prev_sample(
        self, sample: torch.Tensor, timestep: int, prev_timestep: int, model_output: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute x_(t−δ) from x_t and the (combined) model output using formula (9) of the PNDM paper.

        Args:
            sample: current sample x_t.
            timestep: current timestep t, used to index `alphas_cumprod`.
            prev_timestep: target timestep t−δ; when negative, `final_alpha_cumprod` is used instead of indexing.
            model_output: model output e_θ(x_t, t); converted first when `prediction_type` is `v_prediction`.

        Returns:
            the predicted sample at `prev_timestep`.
        """
        # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
        # this function computes x_(t−δ) using the formula of (9)
        # Note that x_t needs to be added to both sides of the equation

        # Notation (variable name -> name in paper)
        # alpha_prod_t -> α_t
        # alpha_prod_t_prev -> α_(t−δ)
        # beta_prod_t -> (1 - α_t)
        # beta_prod_t_prev -> (1 - α_(t−δ))
        # sample -> x_t
        # model_output -> e_θ(x_t, t)
        # prev_sample -> x_(t−δ)
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        if self.prediction_type == PNDMPredictionType.V_PREDICTION:
            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample

        # corresponds to (α_(t−δ) - α_t) divided by
        # denominator of x_t in formula (9) and plus 1
        # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqrt(α_t))) + 1 =
        # sqrt(α_(t−δ)) / sqrt(α_t)
        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)

        # corresponds to denominator of e_θ(x_t, t) in formula (9)
        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
        ) ** (0.5)

        # full formula (9)
        prev_sample = (
            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
        )

        return prev_sample
NoiseSchedules = ComponentStore("NoiseSchedules", "Functions to generate noise schedules")


@NoiseSchedules.add_def("linear_beta", "Linear beta schedule")
def _linear_beta(num_train_timesteps: int, beta_start: float = 1e-4, beta_end: float = 2e-2):
    """
    Beta schedule spaced linearly between `beta_start` and `beta_end`.

    Args:
        num_train_timesteps: number of timesteps
        beta_start: start of beta range, default 1e-4
        beta_end: end of beta range, default 2e-2

    Returns:
        betas: beta schedule tensor
    """
    betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
    return betas


@NoiseSchedules.add_def("scaled_linear_beta", "Scaled linear beta schedule")
def _scaled_linear_beta(num_train_timesteps: int, beta_start: float = 1e-4, beta_end: float = 2e-2):
    """
    Beta schedule that is linear in sqrt(beta): the square roots of the end points are interpolated
    linearly and the result is squared.

    Args:
        num_train_timesteps: number of timesteps
        beta_start: start of beta range, default 1e-4
        beta_end: end of beta range, default 2e-2

    Returns:
        betas: beta schedule tensor
    """
    sqrt_betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32)
    return sqrt_betas**2


@NoiseSchedules.add_def("sigmoid_beta", "Sigmoid beta schedule")
def _sigmoid_beta(num_train_timesteps: int, beta_start: float = 1e-4, beta_end: float = 2e-2, sig_range: float = 6):
    """
    Beta schedule following a sigmoid curve rescaled to the range from `beta_start` to `beta_end`.

    Args:
        num_train_timesteps: number of timesteps
        beta_start: start of beta range, default 1e-4
        beta_end: end of beta range, default 2e-2
        sig_range: pos/neg range of sigmoid input, default 6

    Returns:
        betas: beta schedule tensor
    """
    sigmoid_inputs = torch.linspace(-sig_range, sig_range, num_train_timesteps)
    beta_span = beta_end - beta_start
    return torch.sigmoid(sigmoid_inputs) * beta_span + beta_start


@NoiseSchedules.add_def("cosine", "Cosine schedule")
def _cosine_beta(num_train_timesteps: int, s: float = 8e-3):
    """
    Cosine noise schedule, see https://arxiv.org/abs/2102.09672

    Args:
        num_train_timesteps: number of timesteps
        s: smoothing factor, default 8e-3 (see referenced paper)

    Returns:
        (betas, alphas, alpha_cumprod) values
    """
    steps = torch.linspace(0, num_train_timesteps, num_train_timesteps + 1)
    cumprod = torch.cos(((steps / num_train_timesteps) + s) / (1 + s) * torch.pi * 0.5) ** 2
    # normalise so that the first cumulative product equals 1
    cumprod = cumprod / cumprod[0].item()
    # per-step alphas are ratios of consecutive cumulative products, kept inside [0.0001, 0.9999]
    alphas = torch.clip(cumprod[1:] / cumprod[:-1], 0.0001, 0.9999)
    return 1.0 - alphas, alphas, cumprod[:-1]


class Scheduler(nn.Module):
    """
    Base class for other schedulers based on a noise schedule function.

    Subclasses implement their own way of sampling or stepping; this class only derives the beta, alpha, and
    alpha_cumprod values from the noise schedule function named by `schedule`, which must be the name of a
    component in NoiseSchedules. Every such component is a callable returning either the beta schedule alone or a
    triple of (betas, alphas, alphas_cumprod) values. New schedule functions can be registered with
    NoiseSchedules.add_def, for example:

    .. code-block:: python

        from monai.networks.schedulers import NoiseSchedules, DDPMScheduler

        @NoiseSchedules.add_def("my_beta_schedule", "Some description of your function")
        def _beta_function(num_train_timesteps, beta_start=1e-4, beta_end=2e-2):
            return torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)

        scheduler = DDPMScheduler(num_train_timesteps=1000, schedule="my_beta_schedule")

    All such functions should take an initial positional integer argument `num_train_timesteps` stating the number
    of timesteps the schedule is for; any other arguments are passed by keyword through the constructor's
    `schedule_args` value. To see what noise functions are available, print the object NoiseSchedules to get a
    listing of stored objects with their docstring descriptions.

    Note: in previous versions of the schedulers the argument `schedule_beta` was used to state the beta schedule
    type. This is now replaced with `schedule`, and most names used with the previous argument now have "_beta"
    appended to them, eg. 'schedule_beta="linear"' -> 'schedule="linear_beta"'. The `beta_start` and `beta_end`
    arguments are still used for some schedules but these are provided as keyword arguments now.

    Args:
        num_train_timesteps: number of diffusion steps used to train the model.
        schedule: member of NoiseSchedules,
            a named function returning the beta tensor or (betas, alphas, alphas_cumprod) triple
        schedule_args: arguments to pass to the schedule function
    """

    def __init__(self, num_train_timesteps: int = 1000, schedule: str = "linear_beta", **schedule_args) -> None:
        super().__init__()
        schedule_fn = NoiseSchedules[schedule]
        schedule_result = schedule_fn(**dict(schedule_args, num_train_timesteps=num_train_timesteps))

        # a schedule function returns either the betas alone or the full (betas, alphas, alphas_cumprod) triple
        if not isinstance(schedule_result, tuple):
            self.betas = schedule_result
            self.alphas = 1.0 - self.betas
            self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        else:
            self.betas, self.alphas, self.alphas_cumprod = schedule_result

        self.num_train_timesteps = num_train_timesteps
        self.one = torch.tensor(1.0)

        # settable values
        self.num_inference_steps: int | None = None
        self.timesteps = torch.arange(num_train_timesteps - 1, -1, -1)

    def add_noise(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
        """
        Add noise to the original samples.

        Args:
            original_samples: original samples
            noise: noise to add to samples
            timesteps: timesteps tensor indicating the timestep to be computed for each sample.

        Returns:
            noisy_samples: sample with added noise
        """
        # alphas_cumprod is moved (and stays) on the device and dtype of original_samples
        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
        timesteps = timesteps.to(original_samples.device)

        n_dims = original_samples.ndim
        signal_scale: torch.Tensor = unsqueeze_right(self.alphas_cumprod[timesteps] ** 0.5, n_dims)
        noise_scale: torch.Tensor = unsqueeze_right((1 - self.alphas_cumprod[timesteps]) ** 0.5, n_dims)

        return signal_scale * original_samples + noise_scale * noise

    def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
        """
        Compute `sqrt(alphas_cumprod[t]) * noise - sqrt(1 - alphas_cumprod[t]) * sample` for the given timesteps.

        Args:
            sample: input sample
            noise: noise tensor
            timesteps: timesteps tensor used to index `alphas_cumprod`.

        Returns:
            velocity: the velocity tensor
        """
        # alphas_cumprod is moved (and stays) on the device and dtype of sample
        self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
        timesteps = timesteps.to(sample.device)

        n_dims = sample.ndim
        noise_coeff: torch.Tensor = unsqueeze_right(self.alphas_cumprod[timesteps] ** 0.5, n_dims)
        sample_coeff: torch.Tensor = unsqueeze_right((1 - self.alphas_cumprod[timesteps]) ** 0.5, n_dims)

        return noise_coeff * noise - sample_coeff * sample
class CastTempType(nn.Module):
    """
    Wrap `submodule` so that inputs of dtype `initial_type` are converted to `temporary_type` before the
    submodule runs and the result is converted back to `initial_type` afterwards. Inputs of any other dtype
    are passed to the submodule, and returned from it, without conversion.
    """

    def __init__(self, initial_type, temporary_type, submodule):
        super().__init__()
        self.initial_type = initial_type
        self.temporary_type = temporary_type
        self.submodule = submodule

    def forward(self, x):
        # decide once, from the incoming dtype, whether the round-trip cast applies
        round_trip = x.dtype == self.initial_type
        inner_input = x.to(self.temporary_type) if round_trip else x
        result = self.submodule(inner_input)
        return result.to(self.initial_type) if round_trip else result
map_binary_to_indices, map_classes_to_indices, map_spatial_axes, @@ -671,6 +688,7 @@ weighted_patch_samples, zero_margins, ) +from .utils_morphological_ops import dilate, erode from .utils_pytorch_numpy_unification import ( allclose, any_np_pt, diff --git a/monai/transforms/croppad/array.py b/monai/transforms/croppad/array.py index ce3701b263..813f8c1d44 100644 --- a/monai/transforms/croppad/array.py +++ b/monai/transforms/croppad/array.py @@ -362,10 +362,10 @@ def __init__(self, lazy: bool = False): @staticmethod def compute_slices( - roi_center: Sequence[int] | NdarrayOrTensor | None = None, - roi_size: Sequence[int] | NdarrayOrTensor | None = None, - roi_start: Sequence[int] | NdarrayOrTensor | None = None, - roi_end: Sequence[int] | NdarrayOrTensor | None = None, + roi_center: Sequence[int] | int | NdarrayOrTensor | None = None, + roi_size: Sequence[int] | int | NdarrayOrTensor | None = None, + roi_start: Sequence[int] | int | NdarrayOrTensor | None = None, + roi_end: Sequence[int] | int | NdarrayOrTensor | None = None, roi_slices: Sequence[slice] | None = None, ) -> tuple[slice]: """ @@ -459,10 +459,10 @@ class SpatialCrop(Crop): def __init__( self, - roi_center: Sequence[int] | NdarrayOrTensor | None = None, - roi_size: Sequence[int] | NdarrayOrTensor | None = None, - roi_start: Sequence[int] | NdarrayOrTensor | None = None, - roi_end: Sequence[int] | NdarrayOrTensor | None = None, + roi_center: Sequence[int] | int | NdarrayOrTensor | None = None, + roi_size: Sequence[int] | int | NdarrayOrTensor | None = None, + roi_start: Sequence[int] | int | NdarrayOrTensor | None = None, + roi_end: Sequence[int] | int | NdarrayOrTensor | None = None, roi_slices: Sequence[slice] | None = None, lazy: bool = False, ) -> None: diff --git a/monai/transforms/croppad/dictionary.py b/monai/transforms/croppad/dictionary.py index be9441dc4a..cea11d9676 100644 --- a/monai/transforms/croppad/dictionary.py +++ b/monai/transforms/croppad/dictionary.py @@ -438,10 +438,10 @@ class 
SpatialCropd(Cropd): def __init__( self, keys: KeysCollection, - roi_center: Sequence[int] | None = None, - roi_size: Sequence[int] | None = None, - roi_start: Sequence[int] | None = None, - roi_end: Sequence[int] | None = None, + roi_center: Sequence[int] | int | None = None, + roi_size: Sequence[int] | int | None = None, + roi_start: Sequence[int] | int | None = None, + roi_end: Sequence[int] | int | None = None, roi_slices: Sequence[slice] | None = None, allow_missing_keys: bool = False, lazy: bool = False, diff --git a/monai/transforms/croppad/functional.py b/monai/transforms/croppad/functional.py index a8286fb90c..361ec48dcd 100644 --- a/monai/transforms/croppad/functional.py +++ b/monai/transforms/croppad/functional.py @@ -48,7 +48,7 @@ def _np_pad(img: NdarrayTensor, pad_width: list[tuple[int, int]], mode: str, **k warnings.warn(f"Padding: moving img {img.shape} from cuda to cpu for dtype={img.dtype} mode={mode}.") img_np = img.detach().cpu().numpy() else: - img_np = img + img_np = np.asarray(img) mode = convert_pad_mode(dst=img_np, mode=mode).value if mode == "constant" and "value" in kwargs: kwargs["constant_values"] = kwargs.pop("value") diff --git a/monai/transforms/intensity/array.py b/monai/transforms/intensity/array.py index f9667402c9..3b813809e4 100644 --- a/monai/transforms/intensity/array.py +++ b/monai/transforms/intensity/array.py @@ -30,7 +30,7 @@ from monai.data.utils import get_random_patch, get_valid_patch_size from monai.networks.layers import GaussianFilter, HilbertTransform, MedianFilter, SavitzkyGolayFilter from monai.transforms.transform import RandomizableTransform, Transform -from monai.transforms.utils import Fourier, equalize_hist, is_positive, rescale_array +from monai.transforms.utils import Fourier, equalize_hist, is_positive, rescale_array, soft_clip from monai.transforms.utils_pytorch_numpy_unification import clip, percentile, where from monai.utils.enums import TransformBackends from monai.utils.misc import ensure_tuple, 
ensure_tuple_rep, ensure_tuple_size, fall_back_tuple @@ -54,6 +54,7 @@ "NormalizeIntensity", "ThresholdIntensity", "ScaleIntensityRange", + "ClipIntensityPercentiles", "AdjustContrast", "RandAdjustContrast", "ScaleIntensityRangePercentiles", @@ -91,24 +92,33 @@ class RandGaussianNoise(RandomizableTransform): mean: Mean or “centre” of the distribution. std: Standard deviation (spread) of distribution. dtype: output data type, if None, same as input image. defaults to float32. + sample_std: If True, sample the spread of the Gaussian distribution uniformly from 0 to std. """ backend = [TransformBackends.TORCH, TransformBackends.NUMPY] - def __init__(self, prob: float = 0.1, mean: float = 0.0, std: float = 0.1, dtype: DtypeLike = np.float32) -> None: + def __init__( + self, + prob: float = 0.1, + mean: float = 0.0, + std: float = 0.1, + dtype: DtypeLike = np.float32, + sample_std: bool = True, + ) -> None: RandomizableTransform.__init__(self, prob) self.mean = mean self.std = std self.dtype = dtype self.noise: np.ndarray | None = None + self.sample_std = sample_std def randomize(self, img: NdarrayOrTensor, mean: float | None = None) -> None: super().randomize(None) if not self._do_transform: return None - rand_std = self.R.uniform(0, self.std) - noise = self.R.normal(self.mean if mean is None else mean, rand_std, size=img.shape) + std = self.R.uniform(0, self.std) if self.sample_std else self.std + noise = self.R.normal(self.mean if mean is None else mean, std, size=img.shape) # noise is float64 array, convert to the output dtype to save memory self.noise, *_ = convert_data_type(noise, dtype=self.dtype) @@ -998,6 +1008,151 @@ def __call__(self, img: NdarrayOrTensor) -> NdarrayOrTensor: return ret +class ClipIntensityPercentiles(Transform): + """ + Apply clip based on the intensity distribution of input image. 
+ If `sharpness_factor` is provided, the intensity values will be soft clipped according to + f(x) = x + (1/sharpness_factor)*softplus(- c(x - minv)) - (1/sharpness_factor)*softplus(c(x - maxv)) + From https://medium.com/life-at-hopper/clip-it-clip-it-good-1f1bf711b291 + + Soft clipping preserves the order of the values and maintains the gradient everywhere. + For example: + + .. code-block:: python + :emphasize-lines: 11, 22 + + image = torch.Tensor( + [[[1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5]]]) + + # Hard clipping from lower and upper image intensity percentiles + hard_clipper = ClipIntensityPercentiles(30, 70) + print(hard_clipper(image)) + metatensor([[[2., 2., 3., 4., 4.], + [2., 2., 3., 4., 4.], + [2., 2., 3., 4., 4.], + [2., 2., 3., 4., 4.], + [2., 2., 3., 4., 4.], + [2., 2., 3., 4., 4.]]]) + + + # Soft clipping from lower and upper image intensity percentiles + soft_clipper = ClipIntensityPercentiles(30, 70, 10.) + print(soft_clipper(image)) + metatensor([[[2.0000, 2.0693, 3.0000, 3.9307, 4.0000], + [2.0000, 2.0693, 3.0000, 3.9307, 4.0000], + [2.0000, 2.0693, 3.0000, 3.9307, 4.0000], + [2.0000, 2.0693, 3.0000, 3.9307, 4.0000], + [2.0000, 2.0693, 3.0000, 3.9307, 4.0000], + [2.0000, 2.0693, 3.0000, 3.9307, 4.0000]]]) + + See Also: + + - :py:class:`monai.transforms.ScaleIntensityRangePercentiles` + """ + + backend = [TransformBackends.TORCH, TransformBackends.NUMPY] + + def __init__( + self, + lower: float | None, + upper: float | None, + sharpness_factor: float | None = None, + channel_wise: bool = False, + return_clipping_values: bool = False, + dtype: DtypeLike = np.float32, + ) -> None: + """ + Args: + lower: lower intensity percentile. In the case of hard clipping, None will have the same effect as 0 by + not clipping the lowest input values. 
However, in the case of soft clipping, None and zero will have + two different effects: None will not apply clipping to low values, whereas zero will still transform + the lower values according to the soft clipping transformation. Please check for more details: + https://medium.com/life-at-hopper/clip-it-clip-it-good-1f1bf711b291. + upper: upper intensity percentile. The same as for lower, but this time with the highest values. If we + are looking to perform soft clipping, if None then there will be no effect on this side whereas if set + to 100, the values will be passed via the corresponding clipping equation. + sharpness_factor: if not None, the intensity values will be soft clipped according to + f(x) = x + (1/sharpness_factor)*softplus(- c(x - minv)) - (1/sharpness_factor)*softplus(c(x - maxv)). + defaults to None. + channel_wise: if True, compute intensity percentile and normalize every channel separately. + default to False. + return_clipping_values: whether to return the calculated percentiles in tensor meta information. + If soft clipping and requested percentile is None, return None as the corresponding clipping + values in meta information. Clipping values are stored in a list with each element corresponding + to a channel if channel_wise is set to True. defaults to False. + dtype: output data type, if None, same as input image. defaults to float32. 
+ """ + if lower is None and upper is None: + raise ValueError("lower or upper percentiles must be provided") + if lower is not None and (lower < 0.0 or lower > 100.0): + raise ValueError("Percentiles must be in the range [0, 100]") + if upper is not None and (upper < 0.0 or upper > 100.0): + raise ValueError("Percentiles must be in the range [0, 100]") + if upper is not None and lower is not None and upper < lower: + raise ValueError("upper must be greater than or equal to lower") + if sharpness_factor is not None and sharpness_factor <= 0: + raise ValueError("sharpness_factor must be greater than 0") + + self.lower = lower + self.upper = upper + self.sharpness_factor = sharpness_factor + self.channel_wise = channel_wise + if return_clipping_values: + self.clipping_values: list[tuple[float | None, float | None]] = [] + self.return_clipping_values = return_clipping_values + self.dtype = dtype + + def _clip(self, img: NdarrayOrTensor) -> NdarrayOrTensor: + if self.sharpness_factor is not None: + lower_percentile = percentile(img, self.lower) if self.lower is not None else None + upper_percentile = percentile(img, self.upper) if self.upper is not None else None + img = soft_clip(img, self.sharpness_factor, lower_percentile, upper_percentile, self.dtype) + else: + lower_percentile = percentile(img, self.lower) if self.lower is not None else percentile(img, 0) + upper_percentile = percentile(img, self.upper) if self.upper is not None else percentile(img, 100) + img = clip(img, lower_percentile, upper_percentile) + + if self.return_clipping_values: + self.clipping_values.append( + ( + ( + lower_percentile + if lower_percentile is None + else lower_percentile.item() if hasattr(lower_percentile, "item") else lower_percentile + ), + ( + upper_percentile + if upper_percentile is None + else upper_percentile.item() if hasattr(upper_percentile, "item") else upper_percentile + ), + ) + ) + img = convert_to_tensor(img, track_meta=False) + return img + + def __call__(self, img: 
NdarrayOrTensor) -> NdarrayOrTensor: + """ + Apply the transform to `img`. + """ + img = convert_to_tensor(img, track_meta=get_track_meta()) + img_t = convert_to_tensor(img, track_meta=False) + if self.channel_wise: + img_t = torch.stack([self._clip(img=d) for d in img_t]) # type: ignore + else: + img_t = self._clip(img=img_t) + + img = convert_to_dst_type(img_t, dst=img)[0] + if self.return_clipping_values: + img.meta["clipping_values"] = self.clipping_values # type: ignore + + return img + + class AdjustContrast(Transform): """ Changes image intensity with gamma transform. Each pixel/voxel intensity is updated as:: @@ -1831,15 +1986,19 @@ class RandGibbsNoise(RandomizableTransform): Args: prob (float): probability of applying the transform. - alpha (Sequence(float)): Parametrizes the intensity of the Gibbs noise filter applied. Takes + alpha (float, Sequence(float)): Parametrizes the intensity of the Gibbs noise filter applied. Takes values in the interval [0,1] with alpha = 0 acting as the identity mapping. If a length-2 list is given as [a,b] then the value of alpha will be sampled uniformly from the interval [a,b]. 0 <= a <= b <= 1. + If a float is given, then the value of alpha will be sampled uniformly from the interval [0, alpha]. """ backend = GibbsNoise.backend - def __init__(self, prob: float = 0.1, alpha: Sequence[float] = (0.0, 1.0)) -> None: + def __init__(self, prob: float = 0.1, alpha: float | Sequence[float] = (0.0, 1.0)) -> None: + if isinstance(alpha, float): + alpha = (0, alpha) + alpha = ensure_tuple(alpha) if len(alpha) != 2: raise ValueError("alpha length must be 2.") if alpha[1] > 1 or alpha[0] < 0: @@ -2630,6 +2789,9 @@ class UltrasoundConfidenceMapTransform(Transform): It generates a confidence map by setting source and sink points in the image and computing the probability for random walks to reach the source for each pixel. 
+ The official code is available at: + https://campar.in.tum.de/Main/AthanasiosKaramalisCode + Args: alpha (float, optional): Alpha parameter. Defaults to 2.0. beta (float, optional): Beta parameter. Defaults to 90.0. @@ -2637,14 +2799,32 @@ class UltrasoundConfidenceMapTransform(Transform): mode (str, optional): 'RF' or 'B' mode data. Defaults to 'B'. sink_mode (str, optional): Sink mode. Defaults to 'all'. If 'mask' is selected, a mask must be when calling the transform. Can be one of 'all', 'mid', 'min', 'mask'. + use_cg (bool, optional): Use Conjugate Gradient method for solving the linear system. Defaults to False. + cg_tol (float, optional): Tolerance for the Conjugate Gradient method. Defaults to 1e-6. + Will be used only if `use_cg` is True. + cg_maxiter (int, optional): Maximum number of iterations for the Conjugate Gradient method. Defaults to 200. + Will be used only if `use_cg` is True. """ - def __init__(self, alpha: float = 2.0, beta: float = 90.0, gamma: float = 0.05, mode="B", sink_mode="all") -> None: + def __init__( + self, + alpha: float = 2.0, + beta: float = 90.0, + gamma: float = 0.05, + mode="B", + sink_mode="all", + use_cg=False, + cg_tol: float = 1.0e-6, + cg_maxiter: int = 200, + ): self.alpha = alpha self.beta = beta self.gamma = gamma self.mode = mode self.sink_mode = sink_mode + self.use_cg = use_cg + self.cg_tol = cg_tol + self.cg_maxiter = cg_maxiter if self.mode not in ["B", "RF"]: raise ValueError(f"Unknown mode: {self.mode}. Supported modes are 'B' and 'RF'.") @@ -2654,7 +2834,9 @@ def __init__(self, alpha: float = 2.0, beta: float = 90.0, gamma: float = 0.05, f"Unknown sink mode: {self.sink_mode}. Supported modes are 'all', 'mid', 'min' and 'mask'." 
) - self._compute_conf_map = UltrasoundConfidenceMap(self.alpha, self.beta, self.gamma, self.mode, self.sink_mode) + self._compute_conf_map = UltrasoundConfidenceMap( + self.alpha, self.beta, self.gamma, self.mode, self.sink_mode, self.use_cg, self.cg_tol, self.cg_maxiter + ) def __call__(self, img: NdarrayOrTensor, mask: NdarrayOrTensor | None = None) -> NdarrayOrTensor: """Compute confidence map from an ultrasound image. diff --git a/monai/transforms/intensity/dictionary.py b/monai/transforms/intensity/dictionary.py index 058ef87b95..5dbac485fe 100644 --- a/monai/transforms/intensity/dictionary.py +++ b/monai/transforms/intensity/dictionary.py @@ -26,6 +26,7 @@ from monai.data.meta_obj import get_track_meta from monai.transforms.intensity.array import ( AdjustContrast, + ClipIntensityPercentiles, ComputeHoVerMaps, ForegroundMask, GaussianSharpen, @@ -77,6 +78,7 @@ "NormalizeIntensityd", "ThresholdIntensityd", "ScaleIntensityRanged", + "ClipIntensityPercentilesd", "AdjustContrastd", "RandAdjustContrastd", "ScaleIntensityRangePercentilesd", @@ -122,6 +124,8 @@ "ThresholdIntensityDict", "ScaleIntensityRangeD", "ScaleIntensityRangeDict", + "ClipIntensityPercentilesD", + "ClipIntensityPercentilesDict", "AdjustContrastD", "AdjustContrastDict", "RandAdjustContrastD", @@ -172,7 +176,7 @@ class RandGaussianNoised(RandomizableTransform, MapTransform): """ Dictionary-based version :py:class:`monai.transforms.RandGaussianNoise`. - Add Gaussian noise to image. This transform assumes all the expected fields have same shape, if want to add + Add Gaussian noise to image. This transform assumes all the expected fields have same shape, if you want to add different noise for every field, please use this transform separately. Args: @@ -183,6 +187,7 @@ class RandGaussianNoised(RandomizableTransform, MapTransform): std: Standard deviation (spread) of distribution. dtype: output data type, if None, same as input image. defaults to float32. 
class ClipIntensityPercentilesd(MapTransform):
    """
    Dictionary-based wrapper of :py:class:`monai.transforms.ClipIntensityPercentiles`.
    Clip the intensity values of input image to a specific range based on the intensity distribution of the input.
    If `sharpness_factor` is provided, the intensity values will be soft clipped according to
    f(x) = x + (1/sharpness_factor) * softplus(-sharpness_factor * (x - minv))
             - (1/sharpness_factor) * softplus(sharpness_factor * (x - maxv))

    Args:
        keys: keys of the corresponding items to be transformed.
        lower: lower intensity percentile, forwarded to ``ClipIntensityPercentiles``.
        upper: upper intensity percentile, forwarded to ``ClipIntensityPercentiles``.
        sharpness_factor: if not None, soft clipping is used with this sharpness. defaults to None.
        channel_wise: if True, compute the percentiles and clip every channel separately. defaults to False.
        dtype: forwarded to ``ClipIntensityPercentiles``. defaults to float32.
        allow_missing_keys: don't raise exception if key is missing.
    """

    # expose the wrapped transform's backends, as the other dictionary wrappers in this module do
    backend = ClipIntensityPercentiles.backend

    def __init__(
        self,
        keys: KeysCollection,
        lower: float | None,
        upper: float | None,
        sharpness_factor: float | None = None,
        channel_wise: bool = False,
        dtype: DtypeLike = np.float32,
        allow_missing_keys: bool = False,
    ) -> None:
        super().__init__(keys, allow_missing_keys)
        self.scaler = ClipIntensityPercentiles(
            lower=lower, upper=upper, sharpness_factor=sharpness_factor, channel_wise=channel_wise, dtype=dtype
        )

    def __call__(self, data: dict) -> dict:
        """Return a shallow copy of `data` in which every selected key has been clipped."""
        d = dict(data)
        for key in self.key_iterator(d):
            d[key] = self.scaler(d[key])
        return d
""" @@ -1434,7 +1471,7 @@ def __init__( self, keys: KeysCollection, prob: float = 0.1, - alpha: Sequence[float] = (0.0, 1.0), + alpha: float | Sequence[float] = (0.0, 1.0), allow_missing_keys: bool = False, ) -> None: MapTransform.__init__(self, keys, allow_missing_keys) @@ -1926,6 +1963,7 @@ def __call__(self, data: Mapping[Hashable, NdarrayOrTensor]) -> dict[Hashable, N NormalizeIntensityD = NormalizeIntensityDict = NormalizeIntensityd ThresholdIntensityD = ThresholdIntensityDict = ThresholdIntensityd ScaleIntensityRangeD = ScaleIntensityRangeDict = ScaleIntensityRanged +ClipIntensityPercentilesD = ClipIntensityPercentilesDict = ClipIntensityPercentilesd AdjustContrastD = AdjustContrastDict = AdjustContrastd RandAdjustContrastD = RandAdjustContrastDict = RandAdjustContrastd ScaleIntensityRangePercentilesD = ScaleIntensityRangePercentilesDict = ScaleIntensityRangePercentilesd diff --git a/monai/transforms/io/array.py b/monai/transforms/io/array.py index 7222a26fc3..e0ecc127f2 100644 --- a/monai/transforms/io/array.py +++ b/monai/transforms/io/array.py @@ -307,11 +307,11 @@ class SaveImage(Transform): Args: output_dir: output image directory. - Handled by ``folder_layout`` instead, if ``folder_layout`` is not ``None``. + Handled by ``folder_layout`` instead, if ``folder_layout`` is not ``None``. output_postfix: a string appended to all output file names, default to `trans`. - Handled by ``folder_layout`` instead, if ``folder_layout`` is not ``None``. + Handled by ``folder_layout`` instead, if ``folder_layout`` is not ``None``. output_ext: output file extension name. - Handled by ``folder_layout`` instead, if ``folder_layout`` is not ``None``. + Handled by ``folder_layout`` instead, if ``folder_layout`` is not ``None``. output_dtype: data type (if not None) for saving data. Defaults to ``np.float32``. resample: whether to resample image (if needed) before saving the data array, based on the ``"spatial_shape"`` (and ``"original_affine"``) from metadata. 
diff --git a/monai/transforms/post/array.py b/monai/transforms/post/array.py index da9b23ce57..2e733c4f6c 100644 --- a/monai/transforms/post/array.py +++ b/monai/transforms/post/array.py @@ -211,7 +211,8 @@ def __call__( raise ValueError("`to_onehot=True/False` is deprecated, please use `to_onehot=num_classes` instead.") img = convert_to_tensor(img, track_meta=get_track_meta()) img_t, *_ = convert_data_type(img, torch.Tensor) - if argmax or self.argmax: + argmax = self.argmax if argmax is None else argmax + if argmax: img_t = torch.argmax(img_t, dim=self.kwargs.get("dim", 0), keepdim=self.kwargs.get("keepdim", True)) to_onehot = self.to_onehot if to_onehot is None else to_onehot diff --git a/monai/transforms/regularization/__init__.py b/monai/transforms/regularization/__init__.py new file mode 100644 index 0000000000..1e97f89407 --- /dev/null +++ b/monai/transforms/regularization/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/monai/transforms/regularization/array.py b/monai/transforms/regularization/array.py new file mode 100644 index 0000000000..4bf6cff649 --- /dev/null +++ b/monai/transforms/regularization/array.py @@ -0,0 +1,196 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class Mixer(RandomizableTransform):
    """Base class of the batch-level mixing augmentations (:py:class:`MixUp`, :py:class:`CutMix`, :py:class:`CutOut`)."""

    def __init__(self, batch_size: int, alpha: float = 1.0) -> None:
        """
        Mixer is a base class providing the basic logic for the mixup-class of
        augmentations. In all cases, we need to sample the mixing weights for each
        sample (lambda in the notation used in the papers). Also, pairs of samples
        being mixed are picked by randomly shuffling the batch samples.

        Args:
            batch_size (int): number of samples per batch. That is, samples are expected to
                be of size batchsize x channels [x depth] x height x width.
            alpha (float, optional): mixing weights are sampled from the Beta(alpha, alpha)
                distribution. Defaults to 1.0, the uniform distribution.

        Raises:
            ValueError: if ``alpha`` is not strictly positive.
        """
        super().__init__()
        if alpha <= 0:
            raise ValueError(f"Expected positive number, but got {alpha = }")
        self.alpha = alpha
        self.batch_size = batch_size

    @abstractmethod
    def apply(self, data: torch.Tensor):
        raise NotImplementedError()

    def randomize(self, data=None) -> None:
        """
        Sometimes you may need to apply the same transform to different tensors.
        The idea is to get a sample and then apply it with apply() as often
        as needed. You need to call this method every time you apply the transform to a new
        batch.

        The sampled parameters are stored in ``self._params`` as a 3-tuple:
        per-sample mixing weights, a permutation of the batch indices, and - only when
        ``data`` is given - one random start coordinate per spatial dimension (``data.shape[2:]``).
        """
        super().randomize(None)
        self._params = (
            torch.from_numpy(self.R.beta(self.alpha, self.alpha, self.batch_size)).type(torch.float32),
            self.R.permutation(self.batch_size),
            [torch.from_numpy(self.R.randint(0, d, size=(1,))) for d in data.shape[2:]] if data is not None else [],
        )


class MixUp(Mixer):
    """MixUp as described in:
    Hongyi Zhang, Moustapha Cisse, Yann N. Dauphin, David Lopez-Paz.
    mixup: Beyond Empirical Risk Minimization, ICLR 2018

    Class derived from :py:class:`monai.transforms.Mixer`. See corresponding
    documentation for details on the constructor parameters.
    """

    def apply(self, data: torch.Tensor):
        """Blend every sample with its permuted partner using the per-sample weights from ``randomize``.

        Raises:
            ValueError: if the batch size differs from the sampled weights, or if ``data`` does not
                have 3 or 4 dimensions after the batch dimension.
        """
        weight, perm, _ = self._params
        nsamples, *dims = data.shape
        if len(weight) != nsamples:
            raise ValueError(f"Expected batch of size: {len(weight)}, but got {nsamples}")

        if len(dims) not in [3, 4]:
            raise ValueError("Unexpected number of dimensions")

        # append one singleton axis per non-batch dimension so the weights broadcast over each sample
        mixweight = weight[(Ellipsis,) + (None,) * len(dims)]
        return mixweight * data + (1 - mixweight) * data[perm, ...]

    def __call__(self, data: torch.Tensor, labels: torch.Tensor | None = None, randomize=True):
        """Mix ``data`` (and ``labels``, if given, with the same weights and permutation).

        Returns the mixed data, or a ``(data, labels)`` tuple when ``labels`` is provided.
        Pass ``randomize=False`` to reuse the parameters of a previous ``randomize()`` call.
        """
        data_t = convert_to_tensor(data, track_meta=get_track_meta())
        labels_t = data_t  # will not stay this value, needed to satisfy pylint/mypy
        if labels is not None:
            labels_t = convert_to_tensor(labels, track_meta=get_track_meta())
        if randomize:
            self.randomize()
        if labels is None:
            return convert_to_dst_type(self.apply(data_t), dst=data)[0]

        return (
            convert_to_dst_type(self.apply(data_t), dst=data)[0],
            convert_to_dst_type(self.apply(labels_t), dst=labels)[0],
        )


class CutMix(Mixer):
    """CutMix augmentation as described in:
    Sangdoo Yun, Dongyoon Han, Seong Joon Oh, Sanghyuk Chun, Junsuk Choe, Youngjoon Yoo.
    CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features,
    ICCV 2019

    Class derived from :py:class:`monai.transforms.Mixer`. See corresponding
    documentation for details on the constructor parameters. Here, alpha not only determines
    the mixing weight but also the size of the random rectangles used for mixing.
    Please refer to the paper for details.

    Labels that have spatial dimensions (3 or more dims in total) receive the same cut-and-paste
    as the images. Labels without spatial dimensions (1 or 2 dims in total) are blended with the
    mixing weights instead; they are presumably expected to be one-hot / floating point - verify
    against callers.

    The most common use case is something close to:

    .. code-block:: python

        cm = CutMix(batch_size=8, alpha=0.5)
        for batch in loader:
            images, labels = batch
            augimg, auglabels = cm(images, labels)
            output = model(augimg)
            loss = loss_function(output, auglabels)
            ...

    """

    def apply(self, data: torch.Tensor):
        """Replace a random box in every sample by the same box taken from its permuted partner."""
        weights, perm, coords = self._params
        nsamples, _, *dims = data.shape
        if len(weights) != nsamples:
            raise ValueError(f"Expected batch of size: {len(weights)}, but got {nsamples}")

        mask = torch.ones_like(data)
        for s, weight in enumerate(weights):
            # box edge length per spatial dim scales with sqrt(1 - weight)
            lengths = [d * sqrt(1 - weight) for d in dims]
            idx = [slice(None)] + [slice(c, min(ceil(c + ln), d)) for c, ln, d in zip(coords, lengths, dims)]
            # index with a tuple: multidimensional indexing with a plain list is deprecated
            mask[s][tuple(idx)] = 0

        return mask * data + (1 - mask) * data[perm, ...]

    def apply_on_labels(self, labels: torch.Tensor):
        """Blend labels with those of the permuted partner, using the per-sample mixing weights."""
        weights, perm, _ = self._params
        nsamples, *dims = labels.shape
        if len(weights) != nsamples:
            raise ValueError(f"Expected batch of size: {len(weights)}, but got {nsamples}")

        mixweight = weights[(Ellipsis,) + (None,) * len(dims)]
        return mixweight * labels + (1 - mixweight) * labels[perm, ...]

    def __call__(self, data: torch.Tensor, labels: torch.Tensor | None = None, randomize=True):
        """Augment ``data`` and, if given, ``labels``.

        Returns the augmented data, or an ``(data, labels)`` tuple when ``labels`` is provided.
        Pass ``randomize=False`` to reuse the parameters of a previous ``randomize(data)`` call.
        """
        data_t = convert_to_tensor(data, track_meta=get_track_meta())
        if randomize:
            self.randomize(data)
        augmented = convert_to_dst_type(self.apply(data_t), dst=data)[0]
        if labels is None:
            return augmented

        labels_t = convert_to_tensor(labels, track_meta=get_track_meta())
        # `apply` unpacks (batch, channel, *spatial). For labels of shape (batch,) it cannot unpack,
        # and for (batch, classes) it zeroes the whole mask, i.e. it only permutes the labels and
        # ignores the mixing weights. Such labels must be blended with the weights instead.
        mix_labels = self.apply_on_labels if labels_t.ndim <= 2 else self.apply
        augmented_label = convert_to_dst_type(mix_labels(labels_t), dst=labels)[0]
        return augmented, augmented_label


class CutOut(Mixer):
    """Cutout as described in the paper:
    Terrance DeVries, Graham W. Taylor.
    Improved Regularization of Convolutional Neural Networks with Cutout,
    arXiv:1708.04552

    Class derived from :py:class:`monai.transforms.Mixer`. See corresponding
    documentation for details on the constructor parameters. Here, alpha not only determines
    the mixing weight but also the size of the random rectangles being cut out.
    Please refer to the paper for details.
    """

    def apply(self, data: torch.Tensor):
        """Zero out a random box in every sample of the batch."""
        weights, _, coords = self._params
        nsamples, _, *dims = data.shape
        if len(weights) != nsamples:
            raise ValueError(f"Expected batch of size: {len(weights)}, but got {nsamples}")

        mask = torch.ones_like(data)
        for s, weight in enumerate(weights):
            # box edge length per spatial dim scales with sqrt(1 - weight)
            lengths = [d * sqrt(1 - weight) for d in dims]
            idx = [slice(None)] + [slice(c, min(ceil(c + ln), d)) for c, ln, d in zip(coords, lengths, dims)]
            # index with a tuple: multidimensional indexing with a plain list is deprecated
            mask[s][tuple(idx)] = 0

        return mask * data

    def __call__(self, data: torch.Tensor, randomize=True):
        """Apply cutout to ``data``; pass ``randomize=False`` to reuse previously sampled parameters."""
        data_t = convert_to_tensor(data, track_meta=get_track_meta())
        if randomize:
            self.randomize(data)
        return convert_to_dst_type(self.apply(data_t), dst=data)[0]
class MixUpd(MapTransform, RandomizableTransform):
    """
    Dictionary-based version :py:class:`monai.transforms.MixUp`.

    Notice that the mixup transformation will be the same for all entries
    for consistency, i.e. images and labels must be applied the same augmentation.
    """

    def __init__(
        self, keys: KeysCollection, batch_size: int, alpha: float = 1.0, allow_missing_keys: bool = False
    ) -> None:
        MapTransform.__init__(self, keys, allow_missing_keys)
        self.mixup = MixUp(batch_size, alpha)

    def set_random_state(self, seed: int | None = None, state: np.random.RandomState | None = None) -> MixUpd:
        super().set_random_state(seed, state)
        self.mixup.set_random_state(seed, state)
        return self

    def __call__(self, data):
        d = dict(data)
        # all the keys share the same random state
        self.mixup.randomize(None)
        for k in self.key_iterator(d):
            d[k] = self.mixup(data[k], randomize=False)
        return d


class CutMixd(MapTransform, RandomizableTransform):
    """
    Dictionary-based version :py:class:`monai.transforms.CutMix`.

    The random parameters are drawn once per call (from the first available key) and reused for
    every entry, so all images and labels are augmented consistently.

    Args:
        keys: keys of the corresponding items to be transformed.
        batch_size: number of samples per batch, forwarded to ``CutMix``.
        label_keys: optional keys of the labels, paired one-to-one with ``keys``. If None,
            only the items under ``keys`` are augmented.
        alpha: forwarded to ``CutMix``. Defaults to 1.0.
        allow_missing_keys: don't raise exception if key is missing.
    """

    def __init__(
        self,
        keys: KeysCollection,
        batch_size: int,
        label_keys: KeysCollection | None = None,
        alpha: float = 1.0,
        allow_missing_keys: bool = False,
    ) -> None:
        super().__init__(keys, allow_missing_keys)
        self.mixer = CutMix(batch_size, alpha)
        self.label_keys = ensure_tuple(label_keys) if label_keys is not None else []

    def set_random_state(self, seed: int | None = None, state: np.random.RandomState | None = None) -> CutMixd:
        super().set_random_state(seed, state)
        self.mixer.set_random_state(seed, state)
        return self

    def __call__(self, data):
        d = dict(data)
        first_key: Hashable = self.first_key(d)
        if first_key == ():
            out: dict[Hashable, NdarrayOrTensor] = convert_to_tensor(d, track_meta=get_track_meta())
            return out
        # one random draw, shared by every key processed below
        self.mixer.randomize(d[first_key])

        if not self.label_keys:
            # No labels configured: iterate over the keys alone. Pairing the keys with an empty
            # label sequence would presumably yield nothing and leave the images untouched.
            for key in self.key_iterator(d):
                d[key] = self.mixer(data[key], randomize=False)
            return d

        for key, label_key in self.key_iterator(d, self.label_keys):
            label = data.get(label_key, None)
            if label is None:
                # without a label the mixer returns the tensor itself (not a tuple), so it must
                # not be indexed - `ret[0]` would keep only the first sample of the batch
                d[key] = self.mixer(data[key], randomize=False)
            else:
                d[key], d[label_key] = self.mixer(data[key], label, randomize=False)
        return d


class CutOutd(MapTransform, RandomizableTransform):
    """
    Dictionary-based version :py:class:`monai.transforms.CutOut`.

    Notice that the random parameters are drawn once per call (from the first available key)
    and reused for every entry in the dictionary.
    """

    def __init__(self, keys: KeysCollection, batch_size: int, allow_missing_keys: bool = False) -> None:
        super().__init__(keys, allow_missing_keys)
        self.cutout = CutOut(batch_size)

    def set_random_state(self, seed: int | None = None, state: np.random.RandomState | None = None) -> CutOutd:
        super().set_random_state(seed, state)
        self.cutout.set_random_state(seed, state)
        return self

    def __call__(self, data):
        d = dict(data)
        first_key: Hashable = self.first_key(d)
        if first_key == ():
            out: dict[Hashable, NdarrayOrTensor] = convert_to_tensor(d, track_meta=get_track_meta())
            return out
        # one random draw, shared by every key processed below
        self.cutout.randomize(d[first_key])
        for k in self.key_iterator(d):
            d[k] = self.cutout(data[k], randomize=False)
        return d


MixUpD = MixUpDict = MixUpd
CutMixD = CutMixDict = CutMixd
CutOutD = CutOutDict = CutOutd
def soft_clip(
    arr: NdarrayOrTensor,
    sharpness_factor: float = 1.0,
    minv: NdarrayOrTensor | float | int | None = None,
    maxv: NdarrayOrTensor | float | int | None = None,
    dtype: DtypeLike | torch.dtype = np.float32,
) -> NdarrayOrTensor:
    """
    Apply soft clip to the input array or tensor.
    The intensity values will be soft clipped according to
    f(x) = x + (1/sharpness_factor) * softplus(-sharpness_factor * (x - minv))
             - (1/sharpness_factor) * softplus(sharpness_factor * (x - maxv))
    From https://medium.com/life-at-hopper/clip-it-clip-it-good-1f1bf711b291

    To perform one-sided clipping, set either minv or maxv to None.
    If both are None the (optionally converted) input is returned unchanged.

    Args:
        arr: input array to clip.
        sharpness_factor: the sharpness of the soft clip function, default to 1.
            It is used as a divisor, so it must not be zero.
        minv: minimum value of target clipped array.
        maxv: maximum value of target clipped array.
        dtype: if not None, convert input array to dtype before computation.

    Returns:
        the soft-clipped array or tensor.
    """

    if dtype is not None:
        arr, *_ = convert_data_type(arr, dtype=dtype)

    v = arr
    # each side adds its own correction term, always computed from the unclipped input `arr`
    if minv is not None:
        v = v + softplus(-sharpness_factor * (arr - minv)) / sharpness_factor
    if maxv is not None:
        v = v - softplus(sharpness_factor * (arr - maxv)) / sharpness_factor

    return v


def map_and_generate_sampling_centers(
    label: NdarrayOrTensor,
    spatial_size: Sequence[int] | int,
    num_samples: int,
    label_spatial_shape: Sequence[int] | None = None,
    num_classes: int | None = None,
    image: NdarrayOrTensor | None = None,
    image_threshold: float = 0.0,
    max_samples_per_class: int | None = None,
    ratios: list[float | int] | None = None,
    rand_state: np.random.RandomState | None = None,
    allow_smaller: bool = False,
    warn: bool = True,
) -> tuple[tuple]:
    """
    Combine "map_classes_to_indices" and "generate_label_classes_crop_centers" functions, return crop center coordinates.
    This calls `map_classes_to_indices` to get indices from `label`, takes the spatial shape from
    `label_spatial_shape` if it is given and otherwise from the label itself, calls
    `generate_label_classes_crop_centers`, and returns its results.

    Args:
        label: use the label data to get the indices of every class.
        spatial_size: spatial size of the ROIs to be sampled.
        num_samples: total sample centers to be generated.
        label_spatial_shape: spatial shape of the original label data to unravel selected centers.
        num_classes: number of classes for argmax label, not necessary for One-Hot label.
        image: if image is not None, only return the indices of every class that are within the valid
            region of the image (``image > image_threshold``).
        image_threshold: if enabled `image`, use ``image > image_threshold`` to
            determine the valid image content area and select class indices only in this area.
        max_samples_per_class: maximum length of indices in each class to reduce memory consumption.
            Default is None, no subsampling.
        ratios: ratios of every class in the label to generate crop centers, including background class.
            if None, every class will have the same ratio to generate crop centers.
        rand_state: numpy randomState object to align with other modules.
        allow_smaller: if `False`, an exception will be raised if the image is smaller than
            the requested ROI in any dimension. If `True`, any smaller dimensions will be set to
            match the cropped size (i.e., no cropping in that dimension).
        warn: if `True` prints a warning if a class is not present in the label.

    Returns:
        Tuple of crop centres

    Raises:
        ValueError: if `label` is None, or if no spatial shape can be determined.
    """
    if label is None:
        raise ValueError("label must not be None.")
    indices = map_classes_to_indices(label, num_classes, image, image_threshold, max_samples_per_class)

    # shape priority: explicit argument, then MetaTensor.peek_pending_shape(), then the label's
    # own shape without its first dimension
    if label_spatial_shape is not None:
        _shape = label_spatial_shape
    elif isinstance(label, monai.data.MetaTensor):
        _shape = label.peek_pending_shape()
    else:
        _shape = label.shape[1:]

    if _shape is None:
        raise ValueError(
            "label_spatial_shape or label with a known shape must be provided to infer the output spatial shape."
        )
    centers = generate_label_classes_crop_centers(
        spatial_size, num_samples, _shape, indices, ratios, rand_state, allow_smaller, warn
    )

    return ensure_tuple(centers)


def get_largest_connected_component_mask_point(
    img_pos: NdarrayTensor,
    img_neg: NdarrayTensor,
    point_coords: NdarrayTensor,
    point_labels: NdarrayTensor,
    pos_val: Sequence[int] = (1, 3),
    neg_val: Sequence[int] = (0, 2),
    margins: int = 3,
) -> NdarrayTensor:
    """
    Gets the connected component of img_pos and img_neg that include the positive points and
    negative points separately. The function is used for combining automatic results with interactive
    results in VISTA3D.

    Args:
        img_pos: bool type tensor, shape [B, 1, H, W, D], where B means the foreground masks from a single 3D image.
        img_neg: same format as img_pos but corresponds to negative points.
        pos_val: positive point label values.
        neg_val: negative point label values.
        point_coords: the coordinates of each point, shape [B, N, 3], where N means the number of points.
        point_labels: the label of each point, shape [B, N].
        margins: number of search windows tried around each point. Window ``m`` (``m = 0 .. margins - 1``)
            extends ``m`` voxels on each side of the rounded point, cropped to the volume; the search
            stops at the first window that contains a labelled component.

    Returns:
        a mask converted back to the type of ``img_pos``, containing the union of the selected components.

    Raises:
        RuntimeError: on the numpy path, if ``skimage.measure`` is not available.
        ValueError: on the numpy path, if ``img_pos`` or ``img_neg`` is not of bool dtype.
    """

    cucim_skimage, has_cucim = optional_import("cucim.skimage")

    # use cupy/cucim only when both are available and the input is a non-CPU torch tensor
    use_cp = has_cp and has_cucim and isinstance(img_pos, torch.Tensor) and img_pos.device != torch.device("cpu")
    if use_cp:
        img_pos_ = convert_to_cupy(img_pos.short())  # type: ignore
        img_neg_ = convert_to_cupy(img_neg.short())  # type: ignore
        label = cucim_skimage.measure.label
        lib = cp
    else:
        if not has_measure:
            raise RuntimeError("skimage.measure required.")
        img_pos_, *_ = convert_data_type(img_pos, np.ndarray)
        img_neg_, *_ = convert_data_type(img_neg, np.ndarray)
        # for skimage.measure.label, the input must be bool type
        if img_pos_.dtype != bool or img_neg_.dtype != bool:
            raise ValueError("img_pos and img_neg must be bool type.")
        label = measure.label
        lib = np

    # label connected components once per mask; the component counts are not needed
    features_pos, _ = label(img_pos_, connectivity=3, return_num=True)
    features_neg, _ = label(img_neg_, connectivity=3, return_num=True)

    outs = np.zeros_like(img_pos_)
    for bs in range(point_coords.shape[0]):
        for i, p in enumerate(point_coords[bs]):
            # choose the labelled volume that matches the point type
            if point_labels[bs, i] in pos_val:
                features = features_pos
            elif point_labels[bs, i] in neg_val:
                features = features_neg
            else:
                # if -1 padding point, skip
                continue
            for margin in range(margins):
                if isinstance(p, np.ndarray):
                    x, y, z = np.round(p).astype(int).tolist()
                else:
                    x, y, z = p.float().round().int().tolist()
                # window bounds per axis, clamped to the last three dimensions of `features`
                l, r = max(x - margin, 0), min(x + margin + 1, features.shape[-3])
                t, d = max(y - margin, 0), min(y + margin + 1, features.shape[-2])
                f, b = max(z - margin, 0), min(z + margin + 1, features.shape[-1])
                if (features[bs, 0, l:r, t:d, f:b] > 0).any():
                    # take the largest component id found in the window and add that whole component
                    index = features[bs, 0, l:r, t:d, f:b].max()
                    outs[[bs]] += lib.isin(features[[bs]], index)
                    break
    # components hit by several points were added more than once; clamp back to a binary mask
    outs[outs > 1] = 1
    return convert_to_dst_type(outs, dst=img_pos, dtype=outs.dtype)[0]
image mask. The returned mask has the same spatial + size as `image_size` while the batch dimension is the same as 'point' batch dimension. + The point is converted to a mask ball with radius defined by `radius`. The output + contains two channels each for negative (first channel) and positive points. + + Args: + image_size: The output size of the converted mask. It should be a 3D tuple. + point: [B, N, 3], 3D point coordinates. + point_label: [B, N], 0 or 2 means negative points, 1 or 3 means positive points. + radius: disc ball radius size. + disc: If true, use regular disc, otherwise use gaussian. + """ + masks = torch.zeros([point.shape[0], 2, image_size[0], image_size[1], image_size[2]], device=point.device) + _array = [ + torch.arange(start=0, end=image_size[i], step=1, dtype=torch.float32, device=point.device) for i in range(3) + ] + coord_rows, coord_cols, coord_z = torch.meshgrid(_array[0], _array[1], _array[2]) + # [1, 3, h, w, d] -> [b, 2, 3, h, w, d] + coords = unsqueeze_left(torch.stack((coord_rows, coord_cols, coord_z), dim=0), 6) + coords = coords.repeat(point.shape[0], 2, 1, 1, 1, 1) + for b, n in np.ndindex(*point.shape[:2]): + point_bn = unsqueeze_right(point[b, n], 4) + if point_label[b, n] > -1: + channel = 0 if (point_label[b, n] == 0 or point_label[b, n] == 2) else 1 + pow_diff = torch.pow(coords[b, channel] - point_bn, 2) + if disc: + masks[b, channel] += pow_diff.sum(0) < radius**2 + else: + masks[b, channel] += torch.exp(-pow_diff.sum(0) / (2 * radius**2)) + return masks + + +def sample_points_from_label( + labels: Tensor, + label_set: Sequence[int], + max_ppoint: int = 1, + max_npoint: int = 0, + device: torch.device | str | None = "cpu", + use_center: bool = False, +): + """Sample points from labels. + + Args: + labels: [1, 1, H, W, D] + label_set: local index, must match values in labels. + max_ppoint: maximum positive point samples. + max_npoint: maximum negative point samples. + device: returned tensor device. 
+ use_center: whether to sample points from center. + + Returns: + point: point coordinates of [B, N, 3]. B equals to the length of label_set. + point_label: [B, N], always 0 for negative, 1 for positive. + """ + if not labels.shape[0] == 1: + raise ValueError("labels must have batch size 1.") + + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + labels = labels[0, 0] + unique_labels = labels.unique().cpu().numpy().tolist() + _point = [] + _point_label = [] + for id in label_set: + if id in unique_labels: + plabels = labels == int(id) + nlabels = ~plabels + _plabels = get_largest_connected_component_mask(erode(plabels.unsqueeze(0).unsqueeze(0))[0, 0]) + plabelpoints = torch.nonzero(_plabels).to(device) + if len(plabelpoints) == 0: + plabelpoints = torch.nonzero(plabels).to(device) + nlabelpoints = torch.nonzero(nlabels).to(device) + num_p = min(len(plabelpoints), max_ppoint) + num_n = min(len(nlabelpoints), max_npoint) + pad = max_ppoint + max_npoint - num_p - num_n + if use_center: + pmean = plabelpoints.float().mean(0) + pdis = ((plabelpoints - pmean) ** 2).sum(-1) + _, sorted_indices_tensor = torch.sort(pdis) + sorted_indices = sorted_indices_tensor.cpu().tolist() + else: + sorted_indices = list(range(len(plabelpoints))) + random.shuffle(sorted_indices) + _point.append( + torch.stack( + [plabelpoints[sorted_indices[i]] for i in range(num_p)] + + random.choices(nlabelpoints, k=num_n) + + [torch.tensor([0, 0, 0], device=device)] * pad + ) + ) + _point_label.append(torch.tensor([1] * num_p + [0] * num_n + [-1] * pad).to(device)) + else: + # pad the background labels + _point.append(torch.zeros(max_ppoint + max_npoint, 3).to(device)) + _point_label.append(torch.zeros(max_ppoint + max_npoint).to(device) - 1) + point = torch.stack(_point) + point_label = torch.stack(_point_label) + + return point, point_label + + def remove_small_objects( img: NdarrayTensor, min_size: int = 64, @@ -2150,7 +2438,7 @@ def 
distance_transform_edt( if return_distances: dtype = torch.float64 if float64_distances else torch.float32 if distances is None: - distances = torch.zeros_like(img, dtype=dtype) # type: ignore + distances = torch.zeros_like(img, memory_format=torch.contiguous_format, dtype=dtype) # type: ignore else: if not isinstance(distances, torch.Tensor) and distances.device != img.device: raise TypeError("distances must be a torch.Tensor on the same device as img") diff --git a/monai/transforms/utils_create_transform_ims.py b/monai/transforms/utils_create_transform_ims.py index 4b5990abd3..a29fd4dbf9 100644 --- a/monai/transforms/utils_create_transform_ims.py +++ b/monai/transforms/utils_create_transform_ims.py @@ -269,11 +269,9 @@ def update_docstring(code_path, transform_name): def pre_process_data(data, ndim, is_map, is_post): - """If transform requires 2D data, then convert to 2D""" + """If transform requires 2D data, then convert to 2D by selecting the middle of the last dimension.""" if ndim == 2: - for k in keys: - data[k] = data[k][..., data[k].shape[-1] // 2] - + data = {k: v[..., v.shape[-1] // 2] for k, v in data.items()} if is_map: return data return data[CommonKeys.LABEL] if is_post else data[CommonKeys.IMAGE] diff --git a/monai/transforms/utils_morphological_ops.py b/monai/transforms/utils_morphological_ops.py new file mode 100644 index 0000000000..b3134c1865 --- /dev/null +++ b/monai/transforms/utils_morphological_ops.py @@ -0,0 +1,172 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Sequence + +import torch +import torch.nn.functional as F +from torch import Tensor + +from monai.config import NdarrayOrTensor +from monai.utils import convert_data_type, convert_to_dst_type, ensure_tuple_rep + +__all__ = ["erode", "dilate"] + + +def erode(mask: NdarrayOrTensor, filter_size: int | Sequence[int] = 3, pad_value: float = 1.0) -> NdarrayOrTensor: + """ + Erode 2D/3D binary mask. + + Args: + mask: input 2D/3D binary mask, [N,C,M,N] or [N,C,M,N,P] torch tensor or ndarray. + filter_size: erosion filter size, has to be odd numbers, default to be 3. + pad_value: the filled value for padding. We need to pad the input before filtering + to keep the output with the same size as input. Usually use default value + and not changed. + + Return: + eroded mask, same shape and data type as input. + + Example: + + .. code-block:: python + + # define a naive mask + mask = torch.zeros(3,2,3,3,3) + mask[:,:,1,1,1] = 1.0 + filter_size = 3 + erode_result = erode(mask, filter_size) # expect torch.zeros(3,2,3,3,3) + dilate_result = dilate(mask, filter_size) # expect torch.ones(3,2,3,3,3) + """ + mask_t, *_ = convert_data_type(mask, torch.Tensor) + res_mask_t = erode_t(mask_t, filter_size=filter_size, pad_value=pad_value) + res_mask: NdarrayOrTensor + res_mask, *_ = convert_to_dst_type(src=res_mask_t, dst=mask) + return res_mask + + +def dilate(mask: NdarrayOrTensor, filter_size: int | Sequence[int] = 3, pad_value: float = 0.0) -> NdarrayOrTensor: + """ + Dilate 2D/3D binary mask. + + Args: + mask: input 2D/3D binary mask, [N,C,M,N] or [N,C,M,N,P] torch tensor or ndarray. + filter_size: dilation filter size, has to be odd numbers, default to be 3. + pad_value: the filled value for padding. We need to pad the input before filtering + to keep the output with the same size as input. 
Usually use default value + and not changed. + + Return: + dilated mask, same shape and data type as input. + + Example: + + .. code-block:: python + + # define a naive mask + mask = torch.zeros(3,2,3,3,3) + mask[:,:,1,1,1] = 1.0 + filter_size = 3 + erode_result = erode(mask,filter_size) # expect torch.zeros(3,2,3,3,3) + dilate_result = dilate(mask,filter_size) # expect torch.ones(3,2,3,3,3) + """ + mask_t, *_ = convert_data_type(mask, torch.Tensor) + res_mask_t = dilate_t(mask_t, filter_size=filter_size, pad_value=pad_value) + res_mask: NdarrayOrTensor + res_mask, *_ = convert_to_dst_type(src=res_mask_t, dst=mask) + return res_mask + + +def get_morphological_filter_result_t(mask_t: Tensor, filter_size: int | Sequence[int], pad_value: float) -> Tensor: + """ + Apply a morphological filter to a 2D/3D binary mask tensor. + + Args: + mask_t: input 2D/3D binary mask, [N,C,M,N] or [N,C,M,N,P] torch tensor. + filter_size: morphological filter size, has to be odd numbers. + pad_value: the filled value for padding. We need to pad the input before filtering + to keep the output with the same size as input. + + Return: + Tensor: Morphological filter result mask, same shape as input. + """ + spatial_dims = len(mask_t.shape) - 2 + if spatial_dims not in [2, 3]: + raise ValueError( + f"spatial_dims must be either 2 or 3, " + f"got spatial_dims={spatial_dims} for mask tensor with shape of {mask_t.shape}." 
+ ) + + # Define the structuring element + filter_size = ensure_tuple_rep(filter_size, spatial_dims) + if any(size % 2 == 0 for size in filter_size): + raise ValueError(f"All dimensions in filter_size must be odd numbers, got {filter_size}.") + + structuring_element = torch.ones((mask_t.shape[1], mask_t.shape[1]) + filter_size).to(mask_t.device) + + # Pad the input tensor to handle border pixels + # Calculate padding size + pad_size = [size // 2 for size in filter_size for _ in range(2)] + + input_padded = F.pad(mask_t.float(), pad_size, mode="constant", value=pad_value) + + # Apply filter operation + conv_fn = F.conv2d if spatial_dims == 2 else F.conv3d + output = conv_fn(input_padded, structuring_element, padding=0) / torch.sum(structuring_element[0, ...]) + + return output + + +def erode_t(mask_t: Tensor, filter_size: int | Sequence[int] = 3, pad_value: float = 1.0) -> Tensor: + """ + Erode 2D/3D binary mask with data type as torch tensor. + + Args: + mask_t: input 2D/3D binary mask, [N,C,M,N] or [N,C,M,N,P] torch tensor. + filter_size: erosion filter size, has to be odd numbers, default to be 3. + pad_value: the filled value for padding. We need to pad the input before filtering + to keep the output with the same size as input. Usually use default value + and not changed. + + Return: + Tensor: eroded mask, same shape as input. + """ + + output = get_morphological_filter_result_t(mask_t, filter_size, pad_value) + + # Set output values based on the minimum value within the structuring element + output = torch.where(torch.abs(output - 1.0) < 1e-7, 1.0, 0.0) + + return output + + +def dilate_t(mask_t: Tensor, filter_size: int | Sequence[int] = 3, pad_value: float = 0.0) -> Tensor: + """ + Dilate 2D/3D binary mask with data type as torch tensor. + + Args: + mask_t: input 2D/3D binary mask, [N,C,M,N] or [N,C,M,N,P] torch tensor. + filter_size: dilation filter size, has to be odd numbers, default to be 3. + pad_value: the filled value for padding. 
We need to pad the input before filtering + to keep the output with the same size as input. Usually use default value + and not changed. + + Return: + Tensor: dilated mask, same shape as input. + """ + output = get_morphological_filter_result_t(mask_t, filter_size, pad_value) + + # Set output values based on the maximum value within the structuring element + output = torch.where(output > 0, 1.0, 0.0) + + return output diff --git a/monai/transforms/utils_pytorch_numpy_unification.py b/monai/transforms/utils_pytorch_numpy_unification.py index 0774d50314..020d99af16 100644 --- a/monai/transforms/utils_pytorch_numpy_unification.py +++ b/monai/transforms/utils_pytorch_numpy_unification.py @@ -52,9 +52,24 @@ "median", "mean", "std", + "softplus", ] +def softplus(x: NdarrayOrTensor) -> NdarrayOrTensor: + """stable softplus through `np.logaddexp` with equivalent implementation for torch. + + Args: + x: array/tensor. + + Returns: + Softplus of the input. + """ + if isinstance(x, np.ndarray): + return np.logaddexp(np.zeros_like(x), x) + return torch.logaddexp(torch.zeros_like(x), x) + + def allclose(a: NdarrayTensor, b: NdarrayOrTensor, rtol=1e-5, atol=1e-8, equal_nan=False) -> bool: """`np.allclose` with equivalent implementation for torch.""" b, *_ = convert_to_dst_type(b, a, wrap_sequence=True) diff --git a/monai/utils/__init__.py b/monai/utils/__init__.py index 2c32eb2cf4..03fa1ceed1 100644 --- a/monai/utils/__init__.py +++ b/monai/utils/__init__.py @@ -126,6 +126,7 @@ version_leq, ) from .nvtx import Range +from .ordering import Ordering from .profiling import ( PerfContext, ProfileHandler, diff --git a/monai/utils/misc.py b/monai/utils/misc.py index caa7c067df..40370ca2c6 100644 --- a/monai/utils/misc.py +++ b/monai/utils/misc.py @@ -24,7 +24,6 @@ import warnings from ast import literal_eval from collections.abc import Callable, Iterable, Sequence -from distutils.util import strtobool from math import log10 from pathlib import Path from typing import TYPE_CHECKING, 
Any, TypeVar, cast, overload @@ -78,6 +77,25 @@ "run_cmd", ] + +def _strtobool(val: str) -> bool: + """ + Replaces deprecated (pre python 3.12) + distutils strtobool function. + + True values are y, yes, t, true, on and 1; + False values are n, no, f, false, off and 0. + Raises ValueError if val is anything else. + """ + val = val.lower() + if val in ("y", "yes", "t", "true", "on", "1"): + return True + elif val in ("n", "no", "f", "false", "off", "0"): + return False + else: + raise ValueError(f"invalid truth value {val}") + + _seed = None _flag_deterministic = torch.backends.cudnn.deterministic _flag_cudnn_benchmark = torch.backends.cudnn.benchmark @@ -100,6 +118,7 @@ def star_zip_with(op, *vals): T = TypeVar("T") +NT = TypeVar("NT", np.ndarray, torch.Tensor) @overload @@ -400,7 +419,7 @@ def _parse_var(s): d[key] = literal_eval(value) except ValueError: try: - d[key] = bool(strtobool(str(value))) + d[key] = bool(_strtobool(str(value))) except ValueError: d[key] = value return d @@ -527,7 +546,7 @@ def doc_images() -> str | None: @staticmethod def algo_hash() -> str | None: - return os.environ.get("MONAI_ALGO_HASH", "249bf4b") + return os.environ.get("MONAI_ALGO_HASH", "e4cf5a1") @staticmethod def trace_transform() -> str | None: @@ -796,7 +815,7 @@ def __init__(self, input_unit: str, target_unit: str) -> None: "Both input and target units should be from the same quantity. 
" f"Input quantity is {input_base} while target quantity is {target_base}" ) - self._calculate_conversion_factor() + self.conversion_factor = self._calculate_conversion_factor() def _get_valid_unit_and_base(self, unit): unit = str(unit).lower() @@ -823,7 +842,7 @@ def _calculate_conversion_factor(self): return 1.0 input_power = self._get_unit_power(self.input_unit) target_power = self._get_unit_power(self.target_unit) - self.conversion_factor = 10 ** (input_power - target_power) + return 10 ** (input_power - target_power) def __call__(self, value: int | float) -> Any: return float(value) * self.conversion_factor @@ -889,11 +908,11 @@ def is_sqrt(num: Sequence[int] | int) -> bool: return ensure_tuple(ret) == num -def unsqueeze_right(arr: NdarrayOrTensor, ndim: int) -> NdarrayOrTensor: +def unsqueeze_right(arr: NT, ndim: int) -> NT: """Append 1-sized dimensions to `arr` to create a result with `ndim` dimensions.""" return arr[(...,) + (None,) * (ndim - arr.ndim)] -def unsqueeze_left(arr: NdarrayOrTensor, ndim: int) -> NdarrayOrTensor: +def unsqueeze_left(arr: NT, ndim: int) -> NT: """Prepend 1-sized dimensions to `arr` to create a result with `ndim` dimensions.""" return arr[(None,) * (ndim - arr.ndim)] diff --git a/monai/utils/module.py b/monai/utils/module.py index 5e058c105b..78087aef84 100644 --- a/monai/utils/module.py +++ b/monai/utils/module.py @@ -209,8 +209,11 @@ def load_submodules( if (is_pkg or load_all) and name not in sys.modules and match(exclude_pattern, name) is None: try: mod = import_module(name) - importer.find_spec(name).loader.load_module(name) # type: ignore - submodules.append(mod) + mod_spec = importer.find_spec(name) # type: ignore + if mod_spec and mod_spec.loader: + loader = mod_spec.loader + loader.exec_module(mod) + submodules.append(mod) except OptionalImportError: pass # could not import the optional deps., they are ignored except ImportError as e: @@ -272,7 +275,7 @@ def instantiate(__path: str, __mode: str, **kwargs: Any) -> Any: 
return pdb.runcall(component, **kwargs) except Exception as e: raise RuntimeError( - f"Failed to instantiate component '{__path}' with kwargs: {kwargs}" + f"Failed to instantiate component '{__path}' with keywords: {','.join(kwargs.keys())}" f"\n set '_mode_={CompInitMode.DEBUG}' to enter the debugging mode." ) from e @@ -561,7 +564,7 @@ def version_leq(lhs: str, rhs: str) -> bool: """ lhs, rhs = str(lhs), str(rhs) - pkging, has_ver = optional_import("pkg_resources", name="packaging") + pkging, has_ver = optional_import("packaging.Version") if has_ver: try: return cast(bool, pkging.version.Version(lhs) <= pkging.version.Version(rhs)) @@ -588,7 +591,8 @@ def version_geq(lhs: str, rhs: str) -> bool: """ lhs, rhs = str(lhs), str(rhs) - pkging, has_ver = optional_import("pkg_resources", name="packaging") + pkging, has_ver = optional_import("packaging.Version") + if has_ver: try: return cast(bool, pkging.version.Version(lhs) >= pkging.version.Version(rhs)) @@ -626,7 +630,7 @@ def pytorch_after(major: int, minor: int, patch: int = 0, current_ver_string: st if current_ver_string is None: _env_var = os.environ.get("PYTORCH_VER", "") current_ver_string = _env_var if _env_var else torch.__version__ - ver, has_ver = optional_import("pkg_resources", name="parse_version") + ver, has_ver = optional_import("packaging.version", name="parse") if has_ver: return ver(".".join((f"{major}", f"{minor}", f"{patch}"))) <= ver(f"{current_ver_string}") # type: ignore parts = f"{current_ver_string}".split("+", 1)[0].split(".", 3) diff --git a/monai/utils/ordering.py b/monai/utils/ordering.py new file mode 100644 index 0000000000..1be61f98ab --- /dev/null +++ b/monai/utils/ordering.py @@ -0,0 +1,207 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import numpy as np + +from monai.utils.enums import OrderingTransformations, OrderingType + + +class Ordering: + """ + Ordering class that projects a 2D or 3D image into a 1D sequence. It also allows the image to be transformed with + one of the following transformations: + Reflection (see np.flip for more details). + Transposition (see np.transpose for more details). + 90-degree rotation (see np.rot90 for more details). + + The transformations are applied in the order specified by the transformation_order parameter. + + Args: + ordering_type: The ordering type. One of the following: + - 'raster_scan': The image is projected into a 1D sequence by scanning the image from left to right and from + top to bottom. Also called a row major ordering. + - 's_curve': The image is projected into a 1D sequence by scanning the image in a circular snake like + pattern from top left towards right going in a spiral towards the center. + - 'random': The image is projected into a 1D sequence by randomly shuffling the image. + spatial_dims: The number of spatial dimensions of the image. + dimensions: The dimensions of the image. + reflected_spatial_dims: A tuple of booleans indicating whether to reflect the image along each spatial dimension. + transpositions_axes: A tuple of tuples indicating the axes to transpose the image along. + rot90_axes: A tuple of tuples indicating the axes to rotate the image along. + transformation_order: The order in which to apply the transformations. 
+ """ + + def __init__( + self, + ordering_type: str, + spatial_dims: int, + dimensions: tuple[int, int, int] | tuple[int, int, int, int], + reflected_spatial_dims: tuple[bool, bool] | None = None, + transpositions_axes: tuple[tuple[int, int], ...] | tuple[tuple[int, int, int], ...] | None = None, + rot90_axes: tuple[tuple[int, int], ...] | None = None, + transformation_order: tuple[str, ...] = ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + ) -> None: + super().__init__() + self.ordering_type = ordering_type + + if self.ordering_type not in list(OrderingType): + raise ValueError( + f"ordering_type must be one of the following {list(OrderingType)}, but got {self.ordering_type}." + ) + + self.spatial_dims = spatial_dims + self.dimensions = dimensions + + if len(dimensions) != self.spatial_dims + 1: + raise ValueError(f"dimensions must be of length {self.spatial_dims + 1}, but got {len(dimensions)}.") + + self.reflected_spatial_dims = reflected_spatial_dims + self.transpositions_axes = transpositions_axes + self.rot90_axes = rot90_axes + if len(set(transformation_order)) != len(transformation_order): + raise ValueError(f"No duplicates are allowed. Received {transformation_order}.") + + for transformation in transformation_order: + if transformation not in list(OrderingTransformations): + raise ValueError( + f"Valid transformations are {list(OrderingTransformations)} but received {transformation}." 
+ ) + self.transformation_order = transformation_order + + self.template = self._create_template() + self._sequence_ordering = self._create_ordering() + self._revert_sequence_ordering = np.argsort(self._sequence_ordering) + + def __call__(self, x: np.ndarray) -> np.ndarray: + x = x[self._sequence_ordering] + + return x + + def get_sequence_ordering(self) -> np.ndarray: + return self._sequence_ordering + + def get_revert_sequence_ordering(self) -> np.ndarray: + return self._revert_sequence_ordering + + def _create_ordering(self) -> np.ndarray: + self.template = self._transform_template() + order = self._order_template(template=self.template) + + return order + + def _create_template(self) -> np.ndarray: + spatial_dimensions = self.dimensions[1:] + template = np.arange(np.prod(spatial_dimensions)).reshape(*spatial_dimensions) + + return template + + def _transform_template(self) -> np.ndarray: + for transformation in self.transformation_order: + if transformation == OrderingTransformations.TRANSPOSE.value: + self.template = self._transpose_template(template=self.template) + elif transformation == OrderingTransformations.ROTATE_90.value: + self.template = self._rot90_template(template=self.template) + elif transformation == OrderingTransformations.REFLECT.value: + self.template = self._flip_template(template=self.template) + + return self.template + + def _transpose_template(self, template: np.ndarray) -> np.ndarray: + if self.transpositions_axes is not None: + for axes in self.transpositions_axes: + template = np.transpose(template, axes=axes) + + return template + + def _flip_template(self, template: np.ndarray) -> np.ndarray: + if self.reflected_spatial_dims is not None: + for axis, to_reflect in enumerate(self.reflected_spatial_dims): + template = np.flip(template, axis=axis) if to_reflect else template + + return template + + def _rot90_template(self, template: np.ndarray) -> np.ndarray: + if self.rot90_axes is not None: + for axes in self.rot90_axes: + template 
= np.rot90(template, axes=axes) + + return template + + def _order_template(self, template: np.ndarray) -> np.ndarray: + depths = None + if self.spatial_dims == 2: + rows, columns = template.shape[0], template.shape[1] + else: + rows, columns, depths = (template.shape[0], template.shape[1], template.shape[2]) + + sequence = getattr(self, f"{self.ordering_type}_idx")(rows, columns, depths) + + ordering = np.array([template[tuple(e)] for e in sequence]) + + return ordering + + @staticmethod + def raster_scan_idx(rows: int, cols: int, depths: int | None = None) -> np.ndarray: + idx: list[tuple] = [] + + for r in range(rows): + for c in range(cols): + if depths is not None: + for d in range(depths): + idx.append((r, c, d)) + else: + idx.append((r, c)) + + idx_np = np.array(idx) + + return idx_np + + @staticmethod + def s_curve_idx(rows: int, cols: int, depths: int | None = None) -> np.ndarray: + idx: list[tuple] = [] + + for r in range(rows): + col_idx = range(cols) if r % 2 == 0 else range(cols - 1, -1, -1) + for c in col_idx: + if depths: + depth_idx = range(depths) if c % 2 == 0 else range(depths - 1, -1, -1) + + for d in depth_idx: + idx.append((r, c, d)) + else: + idx.append((r, c)) + + idx_np = np.array(idx) + + return idx_np + + @staticmethod + def random_idx(rows: int, cols: int, depths: int | None = None) -> np.ndarray: + idx: list[tuple] = [] + + for r in range(rows): + for c in range(cols): + if depths: + for d in range(depths): + idx.append((r, c, d)) + else: + idx.append((r, c)) + + idx_np = np.array(idx) + np.random.shuffle(idx_np) + + return idx_np diff --git a/monai/visualize/class_activation_maps.py b/monai/visualize/class_activation_maps.py index 6d1e8dfd03..489a563818 100644 --- a/monai/visualize/class_activation_maps.py +++ b/monai/visualize/class_activation_maps.py @@ -290,7 +290,7 @@ def __init__( ) self.fc_layers = fc_layers - def compute_map(self, x, class_idx=None, layer_idx=-1, 
**kwargs): # type: ignore[override] logits, acti, _ = self.nn_module(x, **kwargs) acti = acti[layer_idx] if class_idx is None: @@ -302,7 +302,7 @@ def compute_map(self, x, class_idx=None, layer_idx=-1, **kwargs): output = torch.stack([output[i, b : b + 1] for i, b in enumerate(class_idx)], dim=0) return output.reshape(b, 1, *spatial) # resume the spatial dims on the selected class - def __call__(self, x, class_idx=None, layer_idx=-1, **kwargs): + def __call__(self, x, class_idx=None, layer_idx=-1, **kwargs): # type: ignore[override] """ Compute the activation map with upsampling and postprocessing. @@ -361,7 +361,7 @@ class GradCAM(CAMBase): """ - def compute_map(self, x, class_idx=None, retain_graph=False, layer_idx=-1, **kwargs): + def compute_map(self, x, class_idx=None, retain_graph=False, layer_idx=-1, **kwargs): # type: ignore[override] _, acti, grad = self.nn_module(x, class_idx=class_idx, retain_graph=retain_graph, **kwargs) acti, grad = acti[layer_idx], grad[layer_idx] b, c, *spatial = grad.shape @@ -369,7 +369,7 @@ def compute_map(self, x, class_idx=None, retain_graph=False, layer_idx=-1, **kwa acti_map = (weights * acti).sum(1, keepdim=True) return F.relu(acti_map) - def __call__(self, x, class_idx=None, layer_idx=-1, retain_graph=False, **kwargs): + def __call__(self, x, class_idx=None, layer_idx=-1, retain_graph=False, **kwargs): # type: ignore[override] """ Compute the activation map with upsampling and postprocessing. 
@@ -401,7 +401,7 @@ class GradCAMpp(GradCAM): """ - def compute_map(self, x, class_idx=None, retain_graph=False, layer_idx=-1, **kwargs): + def compute_map(self, x, class_idx=None, retain_graph=False, layer_idx=-1, **kwargs): # type: ignore[override] _, acti, grad = self.nn_module(x, class_idx=class_idx, retain_graph=retain_graph, **kwargs) acti, grad = acti[layer_idx], grad[layer_idx] b, c, *spatial = grad.shape diff --git a/monai/visualize/img2tensorboard.py b/monai/visualize/img2tensorboard.py index e7884e9b1f..677640bd04 100644 --- a/monai/visualize/img2tensorboard.py +++ b/monai/visualize/img2tensorboard.py @@ -176,7 +176,9 @@ def plot_2d_or_3d_image( # as the `d` data has no batch dim, reduce the spatial dim index if positive frame_dim = frame_dim - 1 if frame_dim > 0 else frame_dim - d: np.ndarray = data_index.detach().cpu().numpy() if isinstance(data_index, torch.Tensor) else data_index + d: np.ndarray = ( + data_index.detach().cpu().numpy() if isinstance(data_index, torch.Tensor) else np.asarray(data_index) + ) if d.ndim == 2: d = rescale_array(d, 0, 1) # type: ignore diff --git a/monai/visualize/utils.py b/monai/visualize/utils.py index f6718fe7a5..88c9a0d66a 100644 --- a/monai/visualize/utils.py +++ b/monai/visualize/utils.py @@ -24,11 +24,9 @@ from monai.utils.type_conversion import convert_data_type, convert_to_dst_type if TYPE_CHECKING: - from matplotlib import cm from matplotlib import pyplot as plt else: plt, _ = optional_import("matplotlib", name="pyplot") - cm, _ = optional_import("matplotlib", name="cm") __all__ = ["matshow3d", "blend_images"] @@ -210,7 +208,7 @@ def blend_images( image = repeat(image, 3, axis=0) def get_label_rgb(cmap: str, label: NdarrayOrTensor) -> NdarrayOrTensor: - _cmap = cm.get_cmap(cmap) + _cmap = plt.colormaps.get_cmap(cmap) label_np, *_ = convert_data_type(label, np.ndarray) label_rgb_np = _cmap(label_np[0]) label_rgb_np = np.moveaxis(label_rgb_np, -1, 0)[:3] diff --git a/pyproject.toml b/pyproject.toml index 
cd8a510b04..53ca608d20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,8 +38,7 @@ exclude = "monai/bundle/__main__.py" [tool.ruff] line-length = 133 -ignore-init-module-imports = true -ignore = ["F401", "E741"] +lint.ignore = ["F401", "E741"] [tool.pytype] # Space-separated list of files or directories to exclude. diff --git a/requirements-dev.txt b/requirements-dev.txt index b08fef874b..9aad0804e6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,7 +2,7 @@ -r requirements-min.txt pytorch-ignite==0.4.11 gdown>=4.7.3 -scipy>=1.7.1 +scipy>=1.12.0; python_version >= '3.9' itk>=5.2 nibabel pillow!=8.3.0 # https://github.com/python-pillow/Pillow/issues/5571 @@ -11,7 +11,7 @@ scikit-image>=0.19.0 tqdm>=4.47.0 lmdb flake8>=3.8.1 -flake8-bugbear +flake8-bugbear<=24.2.6 # https://github.com/Project-MONAI/MONAI/issues/7690 flake8-comprehensions mccabe pep8-naming @@ -21,22 +21,22 @@ black>=22.12 isort>=5.1 ruff pytype>=2020.6.1; platform_system != "Windows" -types-pkg_resources +types-setuptools mypy>=1.5.0 ninja torchvision psutil -cucim>=23.2.0; platform_system == "Linux" +cucim-cu12; platform_system == "Linux" and python_version >= "3.9" and python_version <= "3.10" openslide-python imagecodecs; platform_system == "Linux" or platform_system == "Darwin" tifffile; platform_system == "Linux" or platform_system == "Darwin" pandas requests einops -transformers>=4.36.0 -mlflow>=1.28.0 +transformers>=4.36.0, <4.41.0; python_version <= '3.10' +mlflow>=2.12.2 clearml>=1.10.0rc0 -matplotlib!=3.5.0 +matplotlib>=3.6.3 tensorboardX types-PyYAML pyyaml @@ -46,14 +46,16 @@ pynrrd pre-commit pydicom h5py -nni; platform_system == "Linux" +nni==2.10.1; platform_system == "Linux" and "arm" not in platform_machine and "aarch" not in platform_machine optuna git+https://github.com/Project-MONAI/MetricsReloaded@monai-support#egg=MetricsReloaded onnx>=1.13.0 onnxruntime; python_version <= '3.10' typeguard<3 # https://github.com/microsoft/nni/issues/5457 -filelock!=3.12.0 
# https://github.com/microsoft/nni/issues/5523 +filelock<3.12.0 # https://github.com/microsoft/nni/issues/5523 zarr lpips==0.1.4 nvidia-ml-py huggingface_hub +pyamg>=5.0.0 +git+https://github.com/facebookresearch/segment-anything.git@6fdee8f2727f4506cfbbe553e23b895e27956588 diff --git a/requirements-min.txt b/requirements-min.txt index ad0bb1ef20..21cf9d5e5c 100644 --- a/requirements-min.txt +++ b/requirements-min.txt @@ -1,5 +1,7 @@ # Requirements for minimal tests -r requirements.txt -setuptools>=50.3.0,<66.0.0,!=60.6.0 +setuptools>=50.3.0,<66.0.0,!=60.6.0 ; python_version < "3.12" +setuptools>=70.2.0; python_version >= "3.12" coverage>=5.5 parameterized +packaging diff --git a/requirements.txt b/requirements.txt index 1569646794..aae455f58c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ torch>=1.9 -numpy>=1.20 +numpy>=1.20,<=1.26.0 diff --git a/runtests.sh b/runtests.sh index 0c60bc0f58..65e3a2bb6b 100755 --- a/runtests.sh +++ b/runtests.sh @@ -167,7 +167,7 @@ function clang_format { } function is_pip_installed() { - return $("${PY_EXE}" -c "import sys, pkgutil; sys.exit(0 if pkgutil.find_loader(sys.argv[1]) else 1)" $1) + return $("${PY_EXE}" -c "import sys, importlib.util; sys.exit(0 if importlib.util.find_spec(sys.argv[1]) else 1)" $1) } function clean_py { @@ -738,12 +738,14 @@ fi # network training/inference/eval integration tests if [ $doNetTests = true ] then + set +e # disable exit on failure so that diagnostics can be given on failure echo "${separator}${blue}integration${noColor}" for i in tests/*integration_*.py do echo "$i" ${cmdPrefix}${cmd} "$i" done + set -e # enable exit on failure fi # run model zoo tests diff --git a/setup.cfg b/setup.cfg index 229e2ace56..2115c30a7f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,9 +21,9 @@ classifiers = Intended Audience :: Healthcare Industry Programming Language :: C++ Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 
Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 Topic :: Scientific/Engineering Topic :: Scientific/Engineering :: Artificial Intelligence Topic :: Scientific/Engineering :: Medical Science Apps. @@ -33,7 +33,7 @@ classifiers = Typing :: Typed [options] -python_requires = >= 3.8 +python_requires = >= 3.9 # for compiling and develop setup only # no need to specify the versions so that we could # compile for multiple targeted versions. @@ -49,7 +49,7 @@ all = nibabel ninja scikit-image>=0.14.2 - scipy>=1.7.1 + scipy>=1.12.0; python_version >= '3.9' pillow tensorboard gdown>=4.7.3 @@ -59,16 +59,16 @@ all = tqdm>=4.47.0 lmdb psutil - cucim>=23.2.0 + cucim-cu12; python_version >= '3.9' and python_version <= '3.10' openslide-python tifffile imagecodecs pandas einops - transformers<4.22; python_version <= '3.10' - mlflow>=1.28.0 + transformers>=4.36.0, <4.41.0; python_version <= '3.10' + mlflow>=2.12.2 clearml>=1.10.0rc0 - matplotlib + matplotlib>=3.6.3 tensorboardX pyyaml fire @@ -84,6 +84,7 @@ all = lpips==0.1.4 nvidia-ml-py huggingface_hub + pyamg>=5.0.0 nibabel = nibabel ninja = @@ -91,7 +92,7 @@ ninja = skimage = scikit-image>=0.14.2 scipy = - scipy>=1.7.1 + scipy>=1.12.0; python_version >= '3.9' pillow = pillow!=8.3.0 tensorboard = @@ -111,7 +112,7 @@ lmdb = psutil = psutil cucim = - cucim>=23.2.0 + cucim-cu12 openslide = openslide-python tifffile = @@ -123,11 +124,11 @@ pandas = einops = einops transformers = - transformers<4.22; python_version <= '3.10' + transformers>=4.36.0, <4.41.0; python_version <= '3.10' mlflow = - mlflow + mlflow>=2.12.2 matplotlib = - matplotlib + matplotlib>=3.6.3 clearml = clearml tensorboardX = @@ -136,6 +137,8 @@ pyyaml = pyyaml fire = fire +packaging = + packaging jsonschema = jsonschema pynrrd = @@ -159,9 +162,13 @@ pynvml = nvidia-ml-py # # workaround https://github.com/Project-MONAI/MONAI/issues/5882 # MetricsReloaded = -# MetricsReloaded @ 
git+https://github.com/Project-MONAI/MetricsReloaded@monai-support#egg=MetricsReloaded + # MetricsReloaded @ git+https://github.com/Project-MONAI/MetricsReloaded@monai-support#egg=MetricsReloaded huggingface_hub = huggingface_hub +pyamg = + pyamg>=5.0.0 +# segment-anything = +# segment-anything @ git+https://github.com/facebookresearch/segment-anything@6fdee8f2727f4506cfbbe553e23b895e27956588#egg=segment-anything [flake8] select = B,C,E,F,N,P,T4,W,B9 diff --git a/setup.py b/setup.py index b90d9d0976..576743c1f7 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ import sys import warnings -import pkg_resources +from packaging import version from setuptools import find_packages, setup import versioneer @@ -40,7 +40,7 @@ BUILD_CUDA = FORCE_CUDA or (torch.cuda.is_available() and (CUDA_HOME is not None)) - _pt_version = pkg_resources.parse_version(torch.__version__).release + _pt_version = version.parse(torch.__version__).release if _pt_version is None or len(_pt_version) < 3: raise AssertionError("unknown torch version") TORCH_VERSION = int(_pt_version[0]) * 10000 + int(_pt_version[1]) * 100 + int(_pt_version[2]) diff --git a/tests/hvd_evenly_divisible_all_gather.py b/tests/hvd_evenly_divisible_all_gather.py index 78c6ca06bc..732ad13b83 100644 --- a/tests/hvd_evenly_divisible_all_gather.py +++ b/tests/hvd_evenly_divisible_all_gather.py @@ -30,10 +30,10 @@ def test_data(self): self._run() def _run(self): - if hvd.rank() == 0: - data1 = torch.tensor([[1, 2], [3, 4]]) - data2 = torch.tensor([[1.0, 2.0]]) - data3 = torch.tensor(7) + # if hvd.rank() == 0: + data1 = torch.tensor([[1, 2], [3, 4]]) + data2 = torch.tensor([[1.0, 2.0]]) + data3 = torch.tensor(7) if hvd.rank() == 1: data1 = torch.tensor([[5, 6]]) diff --git a/tests/min_tests.py b/tests/min_tests.py index 8128bb7b84..479c4c8dc2 100644 --- a/tests/min_tests.py +++ b/tests/min_tests.py @@ -154,6 +154,7 @@ def run_testsuit(): "test_plot_2d_or_3d_image", "test_png_rw", "test_prepare_batch_default", + 
"test_prepare_batch_diffusion", "test_prepare_batch_extra_input", "test_prepare_batch_hovernet", "test_rand_grid_patch", @@ -208,6 +209,7 @@ def run_testsuit(): "test_zarr_avg_merger", "test_perceptual_loss", "test_ultrasound_confidence_map_transform", + "test_vista3d_utils", ] assert sorted(exclude_cases) == sorted(set(exclude_cases)), f"Duplicated items in {exclude_cases}" diff --git a/tests/ngc_bundle_download.py b/tests/ngc_bundle_download.py index 01dc044870..107114861c 100644 --- a/tests/ngc_bundle_download.py +++ b/tests/ngc_bundle_download.py @@ -127,7 +127,7 @@ def test_loading_mmar(self, item): in_channels=1, img_size=(96, 96, 96), patch_size=(16, 16, 16), - pos_embed="conv", + proj_type="conv", hidden_size=768, mlp_dim=3072, ) diff --git a/tests/nonconfig_workflow.py b/tests/nonconfig_workflow.py index 7b5328bf72..b2c44c12c6 100644 --- a/tests/nonconfig_workflow.py +++ b/tests/nonconfig_workflow.py @@ -36,8 +36,8 @@ class NonConfigWorkflow(BundleWorkflow): """ - def __init__(self, filename, output_dir): - super().__init__(workflow_type="inference") + def __init__(self, filename, output_dir, meta_file=None, logging_file=None): + super().__init__(workflow_type="inference", meta_file=meta_file, logging_file=logging_file) self.filename = filename self.output_dir = output_dir self._bundle_root = "will override" diff --git a/tests/test_affine_transform.py b/tests/test_affine_transform.py index 6ea036bce8..11464070e0 100644 --- a/tests/test_affine_transform.py +++ b/tests/test_affine_transform.py @@ -133,28 +133,17 @@ def test_to_norm_affine_ill(self, affine, src_size, dst_size, align_corners): class TestAffineTransform(unittest.TestCase): - def test_affine_shift(self): - affine = torch.as_tensor([[1.0, 0.0, 0.0], [0.0, 1.0, -1.0]]) - image = torch.as_tensor([[[[4.0, 1.0, 3.0, 2.0], [7.0, 6.0, 8.0, 5.0], [3.0, 5.0, 3.0, 6.0]]]]) - out = AffineTransform(align_corners=False)(image, affine) - out = out.detach().cpu().numpy() - expected = [[[[0, 4, 1, 3], [0, 7, 6, 
8], [0, 3, 5, 3]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) - - def test_affine_shift_1(self): - affine = torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, -1.0]]) - image = torch.as_tensor([[[[4.0, 1.0, 3.0, 2.0], [7.0, 6.0, 8.0, 5.0], [3.0, 5.0, 3.0, 6.0]]]]) - out = AffineTransform(align_corners=False)(image, affine) - out = out.detach().cpu().numpy() - expected = [[[[0, 0, 0, 0], [0, 4, 1, 3], [0, 7, 6, 8]]]] - np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) - - def test_affine_shift_2(self): - affine = torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, 0.0]]) + @parameterized.expand( + [ + (torch.as_tensor([[1.0, 0.0, 0.0], [0.0, 1.0, -1.0]]), [[[[0, 4, 1, 3], [0, 7, 6, 8], [0, 3, 5, 3]]]]), + (torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, -1.0]]), [[[[0, 0, 0, 0], [0, 4, 1, 3], [0, 7, 6, 8]]]]), + (torch.as_tensor([[1.0, 0.0, -1.0], [0.0, 1.0, 0.0]]), [[[[0, 0, 0, 0], [4, 1, 3, 2], [7, 6, 8, 5]]]]), + ] + ) + def test_affine_transforms(self, affine, expected): image = torch.as_tensor([[[[4.0, 1.0, 3.0, 2.0], [7.0, 6.0, 8.0, 5.0], [3.0, 5.0, 3.0, 6.0]]]]) out = AffineTransform(align_corners=False)(image, affine) out = out.detach().cpu().numpy() - expected = [[[[0, 0, 0, 0], [4, 1, 3, 2], [7, 6, 8, 5]]]] np.testing.assert_allclose(out, expected, atol=1e-5, rtol=_rtol) def test_zoom(self): diff --git a/tests/test_arraydataset.py b/tests/test_arraydataset.py index efc014a267..03239a9764 100644 --- a/tests/test_arraydataset.py +++ b/tests/test_arraydataset.py @@ -40,8 +40,9 @@ class TestCompose(Compose): + __test__ = False # indicate to pytest that this class is not intended for collection - def __call__(self, input_, lazy): + def __call__(self, input_, lazy=False): img = self.transforms[0](input_) metadata = img.meta img = self.transforms[1](img) diff --git a/tests/test_attentionunet.py b/tests/test_attentionunet.py index 83f6cabc5e..6a577f763f 100644 --- a/tests/test_attentionunet.py +++ b/tests/test_attentionunet.py @@ -14,11 
+14,17 @@ import unittest import torch +import torch.nn as nn import monai.networks.nets.attentionunet as att from tests.utils import skip_if_no_cuda, skip_if_quick +def get_net_parameters(net: nn.Module) -> int: + """Returns the total number of parameters in a Module.""" + return sum(param.numel() for param in net.parameters()) + + class TestAttentionUnet(unittest.TestCase): def test_attention_block(self): @@ -50,6 +56,20 @@ def test_attentionunet(self): self.assertEqual(output.shape[0], input.shape[0]) self.assertEqual(output.shape[1], 2) + def test_attentionunet_kernel_size(self): + args_dict = { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 2, + "channels": (3, 4, 5), + "up_kernel_size": 5, + "strides": (1, 2), + } + model_a = att.AttentionUnet(**args_dict, kernel_size=5) + model_b = att.AttentionUnet(**args_dict, kernel_size=7) + self.assertEqual(get_net_parameters(model_a), 3534) + self.assertEqual(get_net_parameters(model_b), 5574) + @skip_if_no_cuda def test_attentionunet_gpu(self): for dims in [2, 3]: diff --git a/tests/test_auto3dseg.py b/tests/test_auto3dseg.py index e2097679e2..5273f0663a 100644 --- a/tests/test_auto3dseg.py +++ b/tests/test_auto3dseg.py @@ -123,6 +123,8 @@ class TestOperations(Operations): Test example for user operation """ + __test__ = False # indicate to pytest that this class is not intended for collection + def __init__(self) -> None: self.data = {"max": np.max, "mean": np.mean, "min": np.min} @@ -132,6 +134,8 @@ class TestAnalyzer(Analyzer): Test example for a simple Analyzer """ + __test__ = False # indicate to pytest that this class is not intended for collection + def __init__(self, key, report_format, stats_name="test"): self.key = key super().__init__(stats_name, report_format) @@ -149,6 +153,8 @@ class TestImageAnalyzer(Analyzer): Test example for a simple Analyzer """ + __test__ = False # indicate to pytest that this class is not intended for collection + def __init__(self, image_key="image", 
stats_name="test_image"): self.image_key = image_key report_format = {"test_stats": None} @@ -367,7 +373,6 @@ def test_filename_case_analyzer(self): for batch_data in self.dataset: d = transform(batch_data[0]) assert DataStatsKeys.BY_CASE_IMAGE_PATH in d - assert DataStatsKeys.BY_CASE_IMAGE_PATH in d def test_filename_case_analyzer_image_only(self): analyzer_image = FilenameStats("image", DataStatsKeys.BY_CASE_IMAGE_PATH) diff --git a/tests/test_auto3dseg_hpo.py b/tests/test_auto3dseg_hpo.py index 34d00336ec..53d09defa0 100644 --- a/tests/test_auto3dseg_hpo.py +++ b/tests/test_auto3dseg_hpo.py @@ -181,7 +181,7 @@ def test_get_history(self) -> None: NNIGen().run_algo(obj_filename, self.work_dir) history = import_bundle_algo_history(self.work_dir, only_trained=True) - assert len(history) == 3 + assert len(history) == 1 def tearDown(self) -> None: self.test_dir.cleanup() diff --git a/tests/test_autoencoderkl.py b/tests/test_autoencoderkl.py new file mode 100644 index 0000000000..d15cb79084 --- /dev/null +++ b/tests/test_autoencoderkl.py @@ -0,0 +1,337 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.apps import download_url +from monai.networks import eval_mode +from monai.networks.nets import AutoencoderKL +from monai.utils import optional_import +from tests.utils import SkipIfBeforePyTorchVersion, skip_if_downloading_fails, testing_data_config + +tqdm, has_tqdm = optional_import("tqdm", name="tqdm") +_, has_einops = optional_import("einops") + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +CASES_NO_ATTENTION = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + }, + (1, 1, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + }, + (1, 1, 16, 16, 16), + (1, 1, 16, 16, 16), + (1, 4, 4, 4, 4), + ], +] + +CASES_ATTENTION = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + }, + (1, 1, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": (1, 1, 2), + "norm_num_groups": 4, + }, + (1, 1, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + 
"out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + }, + (1, 1, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + "with_encoder_nonlocal_attn": False, + }, + (1, 1, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, True), + "num_res_blocks": 1, + "norm_num_groups": 4, + }, + (1, 1, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, True), + "num_res_blocks": 1, + "norm_num_groups": 4, + }, + (1, 1, 16, 16, 16), + (1, 1, 16, 16, 16), + (1, 4, 4, 4, 4), + ], +] + +if has_einops: + CASES = CASES_NO_ATTENTION + CASES_ATTENTION +else: + CASES = CASES_NO_ATTENTION + + +class TestAutoEncoderKL(unittest.TestCase): + @parameterized.expand(CASES) + def test_shape(self, input_param, input_shape, expected_shape, expected_latent_shape): + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.forward(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + self.assertEqual(result[2].shape, expected_latent_shape) + + @parameterized.expand(CASES) + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_with_convtranspose_and_checkpointing( + self, input_param, input_shape, expected_shape, expected_latent_shape + ): + input_param = input_param.copy() + input_param.update({"use_checkpoint": True, "use_convtranspose": True}) + net = 
AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.forward(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + self.assertEqual(result[2].shape, expected_latent_shape) + + def test_model_channels_not_multiple_of_norm_num_group(self): + with self.assertRaises(ValueError): + AutoencoderKL( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(24, 24, 24), + attention_levels=(False, False, False), + latent_channels=8, + num_res_blocks=1, + norm_num_groups=16, + ) + + def test_model_num_channels_not_same_size_of_attention_levels(self): + with self.assertRaises(ValueError): + AutoencoderKL( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(24, 24, 24), + attention_levels=(False, False), + latent_channels=8, + num_res_blocks=1, + norm_num_groups=16, + ) + + def test_model_num_channels_not_same_size_of_num_res_blocks(self): + with self.assertRaises(ValueError): + AutoencoderKL( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(24, 24, 24), + attention_levels=(False, False, False), + latent_channels=8, + num_res_blocks=(8, 8), + norm_num_groups=16, + ) + + def test_shape_reconstruction(self): + input_param, input_shape, expected_shape, _ = CASES[0] + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.reconstruct(torch.randn(input_shape).to(device)) + self.assertEqual(result.shape, expected_shape) + + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_reconstruction_with_convtranspose_and_checkpointing(self): + input_param, input_shape, expected_shape, _ = CASES[0] + input_param = input_param.copy() + input_param.update({"use_checkpoint": True, "use_convtranspose": True}) + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.reconstruct(torch.randn(input_shape).to(device)) + self.assertEqual(result.shape, expected_shape) + + def 
test_shape_encode(self): + input_param, input_shape, _, expected_latent_shape = CASES[0] + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.encode(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_latent_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_encode_with_convtranspose_and_checkpointing(self): + input_param, input_shape, _, expected_latent_shape = CASES[0] + input_param = input_param.copy() + input_param.update({"use_checkpoint": True, "use_convtranspose": True}) + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.encode(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_latent_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + + def test_shape_sampling(self): + input_param, _, _, expected_latent_shape = CASES[0] + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.sampling( + torch.randn(expected_latent_shape).to(device), torch.randn(expected_latent_shape).to(device) + ) + self.assertEqual(result.shape, expected_latent_shape) + + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_sampling_convtranspose_and_checkpointing(self): + input_param, _, _, expected_latent_shape = CASES[0] + input_param = input_param.copy() + input_param.update({"use_checkpoint": True, "use_convtranspose": True}) + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.sampling( + torch.randn(expected_latent_shape).to(device), torch.randn(expected_latent_shape).to(device) + ) + self.assertEqual(result.shape, expected_latent_shape) + + def test_shape_decode(self): + input_param, expected_input_shape, _, latent_shape = CASES[0] + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.decode(torch.randn(latent_shape).to(device)) + self.assertEqual(result.shape, 
expected_input_shape) + + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_decode_convtranspose_and_checkpointing(self): + input_param, expected_input_shape, _, latent_shape = CASES[0] + input_param = input_param.copy() + input_param.update({"use_checkpoint": True, "use_convtranspose": True}) + net = AutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.decode(torch.randn(latent_shape).to(device)) + self.assertEqual(result.shape, expected_input_shape) + + @skipUnless(has_einops, "Requires einops") + def test_compatibility_with_monai_generative(self): + # test loading weights from a model saved in MONAI Generative, version 0.2.3 + with skip_if_downloading_fails(): + net = AutoencoderKL( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(4, 4, 4), + latent_channels=4, + attention_levels=(False, False, True), + num_res_blocks=1, + norm_num_groups=4, + ).to(device) + + tmpdir = tempfile.mkdtemp() + key = "autoencoderkl_monai_generative_weights" + url = testing_data_config("models", key, "url") + hash_type = testing_data_config("models", key, "hash_type") + hash_val = testing_data_config("models", key, "hash_val") + filename = "autoencoderkl_monai_generative_weights.pt" + + weight_path = os.path.join(tmpdir, filename) + download_url(url=url, filepath=weight_path, hash_val=hash_val, hash_type=hash_type) + + net.load_old_state_dict(torch.load(weight_path), verbose=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_autoencoderkl_maisi.py b/tests/test_autoencoderkl_maisi.py new file mode 100644 index 0000000000..0e9f427fb6 --- /dev/null +++ b/tests/test_autoencoderkl_maisi.py @@ -0,0 +1,225 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.apps.generation.maisi.networks.autoencoderkl_maisi import AutoencoderKlMaisi +from monai.networks import eval_mode +from monai.utils import optional_import +from tests.utils import SkipIfBeforePyTorchVersion + +tqdm, has_tqdm = optional_import("tqdm", name="tqdm") +_, has_einops = optional_import("einops") + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +CASES_NO_ATTENTION = [ + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": (1, 1, 1), + "norm_num_groups": 4, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "num_splits": 2, + "print_info": False, + }, + (1, 1, 32, 32, 32), + (1, 1, 32, 32, 32), + (1, 4, 8, 8, 8), + ] +] + +CASES_ATTENTION = [ + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, True), + "num_res_blocks": (1, 1, 1), + "norm_num_groups": 4, + "with_encoder_nonlocal_attn": True, + "with_decoder_nonlocal_attn": True, + "num_splits": 2, + "print_info": False, + }, + (1, 1, 32, 32, 32), + (1, 1, 32, 32, 32), + (1, 4, 8, 8, 8), + ] +] + +if has_einops: + CASES = CASES_NO_ATTENTION + CASES_ATTENTION +else: + CASES = CASES_NO_ATTENTION + + +class TestAutoencoderKlMaisi(unittest.TestCase): + + @parameterized.expand(CASES) + def 
test_shape(self, input_param, input_shape, expected_shape, expected_latent_shape): + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.forward(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + self.assertEqual(result[2].shape, expected_latent_shape) + + @parameterized.expand(CASES) + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_with_convtranspose_and_checkpointing( + self, input_param, input_shape, expected_shape, expected_latent_shape + ): + input_param = input_param.copy() + input_param.update({"use_checkpointing": True, "use_convtranspose": True}) + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.forward(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + self.assertEqual(result[2].shape, expected_latent_shape) + + def test_model_channels_not_multiple_of_norm_num_group(self): + with self.assertRaises(ValueError): + AutoencoderKlMaisi( + spatial_dims=3, + in_channels=1, + out_channels=1, + num_channels=(24, 24, 24), + attention_levels=(False, False, False), + latent_channels=8, + num_res_blocks=(1, 1, 1), + norm_num_groups=16, + num_splits=2, + print_info=False, + ) + + def test_model_num_channels_not_same_size_of_attention_levels(self): + with self.assertRaises(ValueError): + AutoencoderKlMaisi( + spatial_dims=3, + in_channels=1, + out_channels=1, + num_channels=(24, 24, 24), + attention_levels=(False, False), + latent_channels=8, + num_res_blocks=(1, 1, 1), + norm_num_groups=16, + num_splits=2, + print_info=False, + ) + + def test_model_num_channels_not_same_size_of_num_res_blocks(self): + with self.assertRaises(ValueError): + AutoencoderKlMaisi( + spatial_dims=3, + in_channels=1, + out_channels=1, + num_channels=(24, 24), + attention_levels=(False, False, False), + 
latent_channels=8, + num_res_blocks=(8, 8, 8), + norm_num_groups=16, + num_splits=2, + print_info=False, + ) + + def test_shape_reconstruction(self): + input_param, input_shape, expected_shape, _ = CASES[0] + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.reconstruct(torch.randn(input_shape).to(device)) + self.assertEqual(result.shape, expected_shape) + + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_reconstruction_with_convtranspose_and_checkpointing(self): + input_param, input_shape, expected_shape, _ = CASES[0] + input_param = input_param.copy() + input_param.update({"use_checkpointing": True, "use_convtranspose": True}) + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.reconstruct(torch.randn(input_shape).to(device)) + self.assertEqual(result.shape, expected_shape) + + def test_shape_encode(self): + input_param, input_shape, _, expected_latent_shape = CASES[0] + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.encode(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_latent_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_encode_with_convtranspose_and_checkpointing(self): + input_param, input_shape, _, expected_latent_shape = CASES[0] + input_param = input_param.copy() + input_param.update({"use_checkpointing": True, "use_convtranspose": True}) + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.encode(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_latent_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + + def test_shape_sampling(self): + input_param, _, _, expected_latent_shape = CASES[0] + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.sampling( + torch.randn(expected_latent_shape).to(device), 
torch.randn(expected_latent_shape).to(device) + ) + self.assertEqual(result.shape, expected_latent_shape) + + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_sampling_convtranspose_and_checkpointing(self): + input_param, _, _, expected_latent_shape = CASES[0] + input_param = input_param.copy() + input_param.update({"use_checkpointing": True, "use_convtranspose": True}) + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.sampling( + torch.randn(expected_latent_shape).to(device), torch.randn(expected_latent_shape).to(device) + ) + self.assertEqual(result.shape, expected_latent_shape) + + def test_shape_decode(self): + input_param, expected_input_shape, _, latent_shape = CASES[0] + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.decode(torch.randn(latent_shape).to(device)) + self.assertEqual(result.shape, expected_input_shape) + + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_decode_convtranspose_and_checkpointing(self): + input_param, expected_input_shape, _, latent_shape = CASES[0] + input_param = input_param.copy() + input_param.update({"use_checkpointing": True, "use_convtranspose": True}) + net = AutoencoderKlMaisi(**input_param).to(device) + with eval_mode(net): + result = net.decode(torch.randn(latent_shape).to(device)) + self.assertEqual(result.shape, expected_input_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_barlow_twins_loss.py b/tests/test_barlow_twins_loss.py new file mode 100644 index 0000000000..81f4032e0c --- /dev/null +++ b/tests/test_barlow_twins_loss.py @@ -0,0 +1,109 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import numpy as np +import torch +from parameterized import parameterized + +from monai.losses import BarlowTwinsLoss + +TEST_CASES = [ + [ # shape: (2, 4), (2, 4) + {"lambd": 5e-3}, + { + "input": torch.tensor([[1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]]), + "target": torch.tensor([[1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]]), + }, + 4.0, + ], + [ # shape: (2, 4), (2, 4) + {"lambd": 5e-3}, + { + "input": torch.tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]]), + "target": torch.tensor([[1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]]), + }, + 4.0, + ], + [ # shape: (2, 4), (2, 4) + {"lambd": 5e-3}, + { + "input": torch.tensor([[1.0, 0.0, 1.0, 1.0], [0.0, 1.0, 1.0, 0.0]]), + "target": torch.tensor([[1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 0.0, 1.0]]), + }, + 5.2562, + ], + [ # shape: (2, 4), (2, 4) + {"lambd": 5e-4}, + { + "input": torch.tensor([[2.0, 3.0, 1.0, 2.0], [0.0, 1.0, 2.0, 5.0]]), + "target": torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]), + }, + 5.0015, + ], + [ # shape: (4, 4), (4, 4) + {"lambd": 5e-3}, + { + "input": torch.tensor( + [[1.0, 2.0, 1.0, 1.0], [3.0, 1.0, 1.0, 2.0], [1.0, 1.0, 1.0, 1.0], [2.0, 1.0, 1.0, 0.0]] + ), + "target": torch.tensor( + [ + [0.0, 1.0, -1.0, 0.0], + [1 / 3, 0.0, -2 / 3, 1 / 3], + [-2 / 3, -1.0, 7 / 3, 1 / 3], + [1 / 3, 0.0, 1 / 3, -2 / 3], + ] + ), + }, + 1.4736, + ], +] + + +class TestBarlowTwinsLoss(unittest.TestCase): + + @parameterized.expand(TEST_CASES) + def test_result(self, input_param, input_data, expected_val): + barlowtwinsloss = 
BarlowTwinsLoss(**input_param) + result = barlowtwinsloss(**input_data) + np.testing.assert_allclose(result.detach().cpu().numpy(), expected_val, atol=1e-4, rtol=1e-4) + + def test_ill_shape(self): + loss = BarlowTwinsLoss(lambd=5e-3) + with self.assertRaises(ValueError): + loss(torch.ones((1, 2, 3)), torch.ones((1, 1, 2, 3))) + + def test_ill_batch_size(self): + loss = BarlowTwinsLoss(lambd=5e-3) + with self.assertRaises(ValueError): + loss(torch.ones((1, 2)), torch.ones((1, 2))) + + def test_with_cuda(self): + loss = BarlowTwinsLoss(lambd=5e-3) + i = torch.ones((2, 10)) + j = torch.ones((2, 10)) + if torch.cuda.is_available(): + i = i.cuda() + j = j.cuda() + output = loss(i, j) + np.testing.assert_allclose(output.detach().cpu().numpy(), 10.0, atol=1e-4, rtol=1e-4) + + def check_warning_raised(self): + with self.assertWarns(Warning): + BarlowTwinsLoss(lambd=5e-3, batch_size=1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_bundle_ckpt_export.py b/tests/test_bundle_ckpt_export.py index 8f376a06d5..cfcadcfc4c 100644 --- a/tests/test_bundle_ckpt_export.py +++ b/tests/test_bundle_ckpt_export.py @@ -72,9 +72,9 @@ def test_export(self, key_in_ckpt, use_trace): _, metadata, extra_files = load_net_with_metadata( ts_file, more_extra_files=["inference.json", "def_args.json"] ) - self.assertTrue("schema" in metadata) - self.assertTrue("meta_file" in json.loads(extra_files["def_args.json"])) - self.assertTrue("network_def" in json.loads(extra_files["inference.json"])) + self.assertIn("schema", metadata) + self.assertIn("meta_file", json.loads(extra_files["def_args.json"])) + self.assertIn("network_def", json.loads(extra_files["inference.json"])) @parameterized.expand([TEST_CASE_1, TEST_CASE_2, TEST_CASE_3]) def test_default_value(self, key_in_ckpt, use_trace): diff --git a/tests/test_bundle_download.py b/tests/test_bundle_download.py index 89fbe5e8b2..331d228f1e 100644 --- a/tests/test_bundle_download.py +++ b/tests/test_bundle_download.py @@ -16,6 
+16,7 @@ import tempfile import unittest from unittest.case import skipUnless +from unittest.mock import patch import numpy as np import torch @@ -24,6 +25,7 @@ import monai.networks.nets as nets from monai.apps import check_hash from monai.bundle import ConfigParser, create_workflow, load +from monai.bundle.scripts import _examine_monai_version, _list_latest_versions, download from monai.utils import optional_import from tests.utils import ( SkipIfBeforePyTorchVersion, @@ -56,7 +58,7 @@ TEST_CASE_5 = [ ["models/model.pt", "models/model.ts", "configs/train.json"], "brats_mri_segmentation", - "https://api.ngc.nvidia.com/v2/models/nvidia/monaihosting/brats_mri_segmentation/versions/0.3.9/files/brats_mri_segmentation_v0.3.9.zip", + "https://api.ngc.nvidia.com/v2/models/nvidia/monaihosting/brats_mri_segmentation/versions/0.4.0/files/brats_mri_segmentation_v0.4.0.zip", ] TEST_CASE_6 = [["models/model.pt", "configs/train.json"], "renalStructures_CECT_segmentation", "0.1.0"] @@ -173,6 +175,23 @@ def test_monaihosting_url_download_bundle(self, bundle_files, bundle_name, url): file_path = os.path.join(tempdir, bundle_name, file) self.assertTrue(os.path.exists(file_path)) + @parameterized.expand([TEST_CASE_5]) + @skip_if_quick + def test_ngc_private_source_download_bundle(self, bundle_files, bundle_name, _url): + with skip_if_downloading_fails(): + # download a single file from url, also use `args_file` + with tempfile.TemporaryDirectory() as tempdir: + def_args = {"name": bundle_name, "bundle_dir": tempdir} + def_args_file = os.path.join(tempdir, "def_args.json") + parser = ConfigParser() + parser.export_config_file(config=def_args, filepath=def_args_file) + cmd = ["coverage", "run", "-m", "monai.bundle", "download", "--args_file", def_args_file] + cmd += ["--progress", "False", "--source", "ngc_private"] + command_line_tests(cmd) + for file in bundle_files: + file_path = os.path.join(tempdir, bundle_name, file) + self.assertTrue(os.path.exists(file_path)) + 
@parameterized.expand([TEST_CASE_6]) @skip_if_quick def test_monaihosting_source_download_bundle(self, bundle_files, bundle_name, version): @@ -190,6 +209,55 @@ def test_monaihosting_source_download_bundle(self, bundle_files, bundle_name, ve file_path = os.path.join(tempdir, bundle_name, file) self.assertTrue(os.path.exists(file_path)) + @patch("monai.bundle.scripts.get_versions", return_value={"version": "1.2"}) + def test_examine_monai_version(self, mock_get_versions): + self.assertTrue(_examine_monai_version("1.1")[0]) # Should return True, compatible + self.assertTrue(_examine_monai_version("1.2rc1")[0]) # Should return True, compatible + self.assertFalse(_examine_monai_version("1.3")[0]) # Should return False, not compatible + + @patch("monai.bundle.scripts.get_versions", return_value={"version": "1.2rc1"}) + def test_examine_monai_version_rc(self, mock_get_versions): + self.assertTrue(_examine_monai_version("1.2")[0]) # Should return True, compatible + self.assertFalse(_examine_monai_version("1.3")[0]) # Should return False, not compatible + + def test_list_latest_versions(self): + """Test listing of the latest versions.""" + data = { + "modelVersions": [ + {"createdDate": "2021-01-01", "versionId": "1.0"}, + {"createdDate": "2021-01-02", "versionId": "1.1"}, + {"createdDate": "2021-01-03", "versionId": "1.2"}, + ] + } + self.assertEqual(_list_latest_versions(data), ["1.2", "1.1", "1.0"]) + self.assertEqual(_list_latest_versions(data, max_versions=2), ["1.2", "1.1"]) + data = { + "modelVersions": [ + {"createdDate": "2021-01-01", "versionId": "1.0"}, + {"createdDate": "2021-01-02", "versionId": "1.1"}, + ] + } + self.assertEqual(_list_latest_versions(data), ["1.1", "1.0"]) + + @skip_if_quick + @patch("monai.bundle.scripts.get_versions", return_value={"version": "1.2"}) + def test_download_monaihosting(self, mock_get_versions): + """Test checking MONAI version from a metadata file.""" + with patch("monai.bundle.scripts.logger") as mock_logger: + with 
tempfile.TemporaryDirectory() as tempdir: + download(name="spleen_ct_segmentation", bundle_dir=tempdir, source="monaihosting") + # Should have a warning message because the latest version is using monai > 1.2 + mock_logger.warning.assert_called_once() + + @skip_if_quick + @patch("monai.bundle.scripts.get_versions", return_value={"version": "1.2"}) + def test_download_ngc(self, mock_get_versions): + """Test checking MONAI version from a metadata file.""" + with patch("monai.bundle.scripts.logger") as mock_logger: + with tempfile.TemporaryDirectory() as tempdir: + download(name="spleen_ct_segmentation", bundle_dir=tempdir, source="ngc") + mock_logger.warning.assert_not_called() + @skip_if_no_cuda class TestLoad(unittest.TestCase): diff --git a/tests/test_bundle_get_data.py b/tests/test_bundle_get_data.py index 605b3945bb..f84713fbe3 100644 --- a/tests/test_bundle_get_data.py +++ b/tests/test_bundle_get_data.py @@ -51,8 +51,8 @@ class TestGetBundleData(unittest.TestCase): def test_get_all_bundles_list(self, params): with skip_if_downloading_fails(): output = get_all_bundles_list(**params) - self.assertTrue(isinstance(output, list)) - self.assertTrue(isinstance(output[0], tuple)) + self.assertIsInstance(output, list) + self.assertIsInstance(output[0], tuple) self.assertTrue(len(output[0]) == 2) @parameterized.expand([TEST_CASE_1, TEST_CASE_5]) @@ -60,16 +60,17 @@ def test_get_all_bundles_list(self, params): def test_get_bundle_versions(self, params): with skip_if_downloading_fails(): output = get_bundle_versions(**params) - self.assertTrue(isinstance(output, dict)) - self.assertTrue("latest_version" in output and "all_versions" in output) - self.assertTrue("0.1.0" in output["all_versions"]) + self.assertIsInstance(output, dict) + self.assertIn("latest_version", output) + self.assertIn("all_versions", output) + self.assertIn("0.1.0", output["all_versions"]) @parameterized.expand([TEST_CASE_1, TEST_CASE_2]) @skip_if_quick def test_get_bundle_info(self, params): with 
skip_if_downloading_fails(): output = get_bundle_info(**params) - self.assertTrue(isinstance(output, dict)) + self.assertIsInstance(output, dict) for key in ["id", "name", "size", "download_count", "browser_download_url"]: self.assertTrue(key in output) @@ -78,7 +79,7 @@ def test_get_bundle_info(self, params): def test_get_bundle_info_monaihosting(self, params): with skip_if_downloading_fails(): output = get_bundle_info(**params) - self.assertTrue(isinstance(output, dict)) + self.assertIsInstance(output, dict) for key in ["name", "browser_download_url"]: self.assertTrue(key in output) diff --git a/tests/test_bundle_trt_export.py b/tests/test_bundle_trt_export.py index 47034852ef..833a0ca1dc 100644 --- a/tests/test_bundle_trt_export.py +++ b/tests/test_bundle_trt_export.py @@ -91,9 +91,9 @@ def test_trt_export(self, convert_precision, input_shape, dynamic_batch): _, metadata, extra_files = load_net_with_metadata( ts_file, more_extra_files=["inference.json", "def_args.json"] ) - self.assertTrue("schema" in metadata) - self.assertTrue("meta_file" in json.loads(extra_files["def_args.json"])) - self.assertTrue("network_def" in json.loads(extra_files["inference.json"])) + self.assertIn("schema", metadata) + self.assertIn("meta_file", json.loads(extra_files["def_args.json"])) + self.assertIn("network_def", json.loads(extra_files["inference.json"])) @parameterized.expand([TEST_CASE_3, TEST_CASE_4]) @unittest.skipUnless( @@ -129,9 +129,9 @@ def test_onnx_trt_export(self, convert_precision, input_shape, dynamic_batch): _, metadata, extra_files = load_net_with_metadata( ts_file, more_extra_files=["inference.json", "def_args.json"] ) - self.assertTrue("schema" in metadata) - self.assertTrue("meta_file" in json.loads(extra_files["def_args.json"])) - self.assertTrue("network_def" in json.loads(extra_files["inference.json"])) + self.assertIn("schema", metadata) + self.assertIn("meta_file", json.loads(extra_files["def_args.json"])) + self.assertIn("network_def", 
json.loads(extra_files["inference.json"])) if __name__ == "__main__": diff --git a/tests/test_bundle_workflow.py b/tests/test_bundle_workflow.py index f7da37acef..1727fcdf53 100644 --- a/tests/test_bundle_workflow.py +++ b/tests/test_bundle_workflow.py @@ -35,6 +35,8 @@ TEST_CASE_3 = [os.path.join(os.path.dirname(__file__), "testing_data", "config_fl_train.json")] +TEST_CASE_NON_CONFIG_WRONG_LOG = [None, "logging.conf", "Cannot find the logging config file: logging.conf."] + class TestBundleWorkflow(unittest.TestCase): @@ -103,6 +105,16 @@ def test_inference_config(self, config_file): ) self._test_inferer(inferer) + # test property path + inferer = ConfigWorkflow( + config_file=config_file, + properties_path=os.path.join(os.path.dirname(__file__), "testing_data", "fl_infer_properties.json"), + logging_file=os.path.join(os.path.dirname(__file__), "testing_data", "logging.conf"), + **override, + ) + self._test_inferer(inferer) + self.assertEqual(inferer.workflow_type, None) + @parameterized.expand([TEST_CASE_3]) def test_train_config(self, config_file): # test standard MONAI model-zoo config workflow @@ -126,11 +138,11 @@ def test_train_config(self, config_file): self.assertListEqual(trainer.check_properties(), []) # test read / write the properties dataset = trainer.train_dataset - self.assertTrue(isinstance(dataset, Dataset)) + self.assertIsInstance(dataset, Dataset) inferer = trainer.train_inferer - self.assertTrue(isinstance(inferer, SimpleInferer)) + self.assertIsInstance(inferer, SimpleInferer) # test optional properties get - self.assertTrue(trainer.train_key_metric is None) + self.assertIsNone(trainer.train_key_metric) trainer.train_dataset = deepcopy(dataset) trainer.train_inferer = deepcopy(inferer) # test optional properties set @@ -144,8 +156,14 @@ def test_train_config(self, config_file): def test_non_config(self): # test user defined python style workflow inferer = NonConfigWorkflow(self.filename, self.data_dir) + self.assertEqual(inferer.meta_file, 
None) self._test_inferer(inferer) + @parameterized.expand([TEST_CASE_NON_CONFIG_WRONG_LOG]) + def test_non_config_wrong_log_cases(self, meta_file, logging_file, expected_error): + with self.assertRaisesRegex(FileNotFoundError, expected_error): + NonConfigWorkflow(self.filename, self.data_dir, meta_file, logging_file) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_cell_sam_wrapper.py b/tests/test_cell_sam_wrapper.py new file mode 100644 index 0000000000..2f1ee2b901 --- /dev/null +++ b/tests/test_cell_sam_wrapper.py @@ -0,0 +1,58 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets.cell_sam_wrapper import CellSamWrapper +from monai.utils import optional_import + +build_sam_vit_b, has_sam = optional_import("segment_anything.build_sam", name="build_sam_vit_b") + +device = "cuda" if torch.cuda.is_available() else "cpu" +TEST_CASE_CELLSEGWRAPPER = [] +for dims in [128, 256, 512, 1024]: + test_case = [ + {"auto_resize_inputs": True, "network_resize_roi": [1024, 1024], "checkpoint": None}, + (1, 3, *([dims] * 2)), + (1, 3, *([dims] * 2)), + ] + TEST_CASE_CELLSEGWRAPPER.append(test_case) + + +@unittest.skipUnless(has_sam, "Requires SAM installation") +class TestResNetDS(unittest.TestCase): + + @parameterized.expand(TEST_CASE_CELLSEGWRAPPER) + def test_shape(self, input_param, input_shape, expected_shape): + net = CellSamWrapper(**input_param).to(device) + with eval_mode(net): + result = net(torch.randn(input_shape).to(device)) + self.assertEqual(result.shape, expected_shape, msg=str(input_param)) + + def test_ill_arg0(self): + with self.assertRaises(RuntimeError): + net = CellSamWrapper(auto_resize_inputs=False, checkpoint=None).to(device) + net(torch.randn([1, 3, 256, 256]).to(device)) + + def test_ill_arg1(self): + with self.assertRaises(RuntimeError): + net = CellSamWrapper(network_resize_roi=[256, 256], checkpoint=None).to(device) + net(torch.randn([1, 3, 1024, 1024]).to(device)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_clip_intensity_percentiles.py b/tests/test_clip_intensity_percentiles.py new file mode 100644 index 0000000000..77f811db87 --- /dev/null +++ b/tests/test_clip_intensity_percentiles.py @@ -0,0 +1,198 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.transforms import ClipIntensityPercentiles +from monai.transforms.utils import soft_clip +from monai.transforms.utils_pytorch_numpy_unification import clip, percentile +from monai.utils.type_conversion import convert_to_tensor +from tests.utils import TEST_NDARRAYS, NumpyImageTestCase2D, NumpyImageTestCase3D, assert_allclose + + +def test_hard_clip_func(im, lower, upper): + im_t = convert_to_tensor(im) + if lower is None: + upper = percentile(im_t, upper) + elif upper is None: + lower = percentile(im_t, lower) + else: + lower, upper = percentile(im_t, (lower, upper)) + return clip(im_t, lower, upper) + + +def test_soft_clip_func(im, lower, upper): + im_t = convert_to_tensor(im) + if lower is None: + upper = percentile(im_t, upper) + elif upper is None: + lower = percentile(im_t, lower) + else: + lower, upper = percentile(im_t, (lower, upper)) + return soft_clip(im_t, minv=lower, maxv=upper, sharpness_factor=1.0, dtype=torch.float32) + + +class TestClipIntensityPercentiles2D(NumpyImageTestCase2D): + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_two_sided(self, p): + hard_clipper = ClipIntensityPercentiles(upper=95, lower=5) + im = p(self.imt) + result = hard_clipper(im) + expected = test_hard_clip_func(im, 5, 95) + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_one_sided_high(self, p): + hard_clipper 
= ClipIntensityPercentiles(upper=95, lower=None) + im = p(self.imt) + result = hard_clipper(im) + expected = test_hard_clip_func(im, 0, 95) + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_one_sided_low(self, p): + hard_clipper = ClipIntensityPercentiles(upper=None, lower=5) + im = p(self.imt) + result = hard_clipper(im) + expected = test_hard_clip_func(im, 5, 100) + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_two_sided(self, p): + soft_clipper = ClipIntensityPercentiles(upper=95, lower=5, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper(im) + expected = test_soft_clip_func(im, 5, 95) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_one_sided_high(self, p): + soft_clipper = ClipIntensityPercentiles(upper=95, lower=None, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper(im) + expected = test_soft_clip_func(im, None, 95) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_one_sided_low(self, p): + soft_clipper = ClipIntensityPercentiles(upper=None, lower=5, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper(im) + expected = test_soft_clip_func(im, 5, None) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + 
@parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_channel_wise(self, p): + clipper = ClipIntensityPercentiles(upper=95, lower=5, channel_wise=True) + im = p(self.imt) + result = clipper(im) + im_t = convert_to_tensor(self.imt) + for i, c in enumerate(im_t): + lower, upper = percentile(c, (5, 95)) + expected = clip(c, lower, upper) + assert_allclose(result[i], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + def test_ill_sharpness_factor(self): + with self.assertRaises(ValueError): + ClipIntensityPercentiles(upper=95, lower=5, sharpness_factor=0.0) + + def test_ill_lower_percentile(self): + with self.assertRaises(ValueError): + ClipIntensityPercentiles(upper=None, lower=-1) + + def test_ill_upper_percentile(self): + with self.assertRaises(ValueError): + ClipIntensityPercentiles(upper=101, lower=None) + + def test_ill_percentiles(self): + with self.assertRaises(ValueError): + ClipIntensityPercentiles(upper=95, lower=96) + + def test_ill_both_none(self): + with self.assertRaises(ValueError): + ClipIntensityPercentiles(upper=None, lower=None) + + +class TestClipIntensityPercentiles3D(NumpyImageTestCase3D): + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_two_sided(self, p): + hard_clipper = ClipIntensityPercentiles(upper=95, lower=5) + im = p(self.imt) + result = hard_clipper(im) + expected = test_hard_clip_func(im, 5, 95) + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_one_sided_high(self, p): + hard_clipper = ClipIntensityPercentiles(upper=95, lower=None) + im = p(self.imt) + result = hard_clipper(im) + expected = test_hard_clip_func(im, 0, 95) + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_one_sided_low(self, p): + hard_clipper = ClipIntensityPercentiles(upper=None, lower=5) + im = p(self.imt) + 
result = hard_clipper(im) + expected = test_hard_clip_func(im, 5, 100) + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_two_sided(self, p): + soft_clipper = ClipIntensityPercentiles(upper=95, lower=5, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper(im) + expected = test_soft_clip_func(im, 5, 95) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_one_sided_high(self, p): + soft_clipper = ClipIntensityPercentiles(upper=95, lower=None, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper(im) + expected = test_soft_clip_func(im, None, 95) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_one_sided_low(self, p): + soft_clipper = ClipIntensityPercentiles(upper=None, lower=5, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper(im) + expected = test_soft_clip_func(im, 5, None) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + assert_allclose(result, p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_channel_wise(self, p): + clipper = ClipIntensityPercentiles(upper=95, lower=5, channel_wise=True) + im = p(self.imt) + result = clipper(im) + im_t = convert_to_tensor(self.imt) + for i, c in enumerate(im_t): + lower, upper = percentile(c, (5, 95)) + expected = clip(c, lower, upper) + assert_allclose(result[i], p(expected), type_test="tensor", rtol=1e-4, atol=0) 
+ + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_clip_intensity_percentilesd.py b/tests/test_clip_intensity_percentilesd.py new file mode 100644 index 0000000000..3e06b18418 --- /dev/null +++ b/tests/test_clip_intensity_percentilesd.py @@ -0,0 +1,196 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +from parameterized import parameterized + +from monai.transforms import ClipIntensityPercentilesd +from monai.transforms.utils_pytorch_numpy_unification import clip, percentile +from monai.utils.type_conversion import convert_to_tensor +from tests.utils import TEST_NDARRAYS, NumpyImageTestCase2D, NumpyImageTestCase3D, assert_allclose + +from .test_clip_intensity_percentiles import test_hard_clip_func, test_soft_clip_func + + +class TestClipIntensityPercentilesd2D(NumpyImageTestCase2D): + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_two_sided(self, p): + key = "img" + hard_clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=5) + im = p(self.imt) + result = hard_clipper({key: im}) + expected = test_hard_clip_func(im, 5, 95) + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_one_sided_high(self, p): + key = "img" + hard_clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=None) + im = p(self.imt) + 
result = hard_clipper({key: im}) + expected = test_hard_clip_func(im, 0, 95) + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_one_sided_low(self, p): + key = "img" + hard_clipper = ClipIntensityPercentilesd(keys=[key], upper=None, lower=5) + im = p(self.imt) + result = hard_clipper({key: im}) + expected = test_hard_clip_func(im, 5, 100) + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_two_sided(self, p): + key = "img" + soft_clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=5, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper({key: im}) + expected = test_soft_clip_func(im, 5, 95) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_one_sided_high(self, p): + key = "img" + soft_clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=None, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper({key: im}) + expected = test_soft_clip_func(im, None, 95) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_one_sided_low(self, p): + key = "img" + soft_clipper = ClipIntensityPercentilesd(keys=[key], upper=None, lower=5, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper({key: im}) + expected = test_soft_clip_func(im, 5, None) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable across torch and numpy + 
assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_channel_wise(self, p): + key = "img" + clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=5, channel_wise=True) + im = p(self.imt) + result = clipper({key: im}) + im_t = convert_to_tensor(self.imt) + for i, c in enumerate(im_t): + lower, upper = percentile(c, (5, 95)) + expected = clip(c, lower, upper) + assert_allclose(result[key][i], p(expected), type_test="tensor", rtol=1e-3, atol=0) + + def test_ill_sharpness_factor(self): + key = "img" + with self.assertRaises(ValueError): + ClipIntensityPercentilesd(keys=[key], upper=95, lower=5, sharpness_factor=0.0) + + def test_ill_lower_percentile(self): + key = "img" + with self.assertRaises(ValueError): + ClipIntensityPercentilesd(keys=[key], upper=None, lower=-1) + + def test_ill_upper_percentile(self): + key = "img" + with self.assertRaises(ValueError): + ClipIntensityPercentilesd(keys=[key], upper=101, lower=None) + + def test_ill_percentiles(self): + key = "img" + with self.assertRaises(ValueError): + ClipIntensityPercentilesd(keys=[key], upper=95, lower=96) + + def test_ill_both_none(self): + key = "img" + with self.assertRaises(ValueError): + ClipIntensityPercentilesd(keys=[key], upper=None, lower=None) + + +class TestClipIntensityPercentilesd3D(NumpyImageTestCase3D): + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_two_sided(self, p): + key = "img" + hard_clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=5) + im = p(self.imt) + result = hard_clipper({key: im}) + expected = test_hard_clip_func(im, 5, 95) + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_one_sided_high(self, p): + key = "img" + hard_clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=None) + im = p(self.imt) + result = 
hard_clipper({key: im}) + expected = test_hard_clip_func(im, 0, 95) + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_hard_clipping_one_sided_low(self, p): + key = "img" + hard_clipper = ClipIntensityPercentilesd(keys=[key], upper=None, lower=5) + im = p(self.imt) + result = hard_clipper({key: im}) + expected = test_hard_clip_func(im, 5, 100) + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_two_sided(self, p): + key = "img" + soft_clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=5, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper({key: im}) + expected = test_soft_clip_func(im, 5, 95) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable accross torch and numpy + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_one_sided_high(self, p): + key = "img" + soft_clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=None, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper({key: im}) + expected = test_soft_clip_func(im, None, 95) + # the rtol is set to 1e-4 because the logaddexp function used in softplus is not stable accross torch and numpy + assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_soft_clipping_one_sided_low(self, p): + key = "img" + soft_clipper = ClipIntensityPercentilesd(keys=[key], upper=None, lower=5, sharpness_factor=1.0) + im = p(self.imt) + result = soft_clipper({key: im}) + expected = test_soft_clip_func(im, 5, None) + # the rtol is set to 1e-6 because the logaddexp function used in softplus is not stable accross torch and numpy + 
assert_allclose(result[key], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + @parameterized.expand([[p] for p in TEST_NDARRAYS]) + def test_channel_wise(self, p): + key = "img" + clipper = ClipIntensityPercentilesd(keys=[key], upper=95, lower=5, channel_wise=True) + im = p(self.imt) + result = clipper({key: im}) + im_t = convert_to_tensor(im) + for i, c in enumerate(im_t): + lower, upper = percentile(c, (5, 95)) + expected = clip(c, lower, upper) + assert_allclose(result[key][i], p(expected), type_test="tensor", rtol=1e-4, atol=0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_component_store.py b/tests/test_component_store.py index 424eceb3d1..7e7c6dd19d 100644 --- a/tests/test_component_store.py +++ b/tests/test_component_store.py @@ -48,17 +48,17 @@ def test_add2(self): self.cs.add("test_obj2", "Test object", test_obj2) self.assertEqual(len(self.cs), 2) - self.assertTrue("test_obj1" in self.cs) - self.assertTrue("test_obj2" in self.cs) + self.assertIn("test_obj1", self.cs) + self.assertIn("test_obj2", self.cs) def test_add_def(self): - self.assertFalse("test_func" in self.cs) + self.assertNotIn("test_func", self.cs) @self.cs.add_def("test_func", "Test function") def test_func(): return 123 - self.assertTrue("test_func" in self.cs) + self.assertIn("test_func", self.cs) self.assertEqual(len(self.cs), 1) self.assertEqual(list(self.cs), [("test_func", test_func)]) diff --git a/tests/test_compose.py b/tests/test_compose.py index 309767833b..3c53ac4a22 100644 --- a/tests/test_compose.py +++ b/tests/test_compose.py @@ -716,15 +716,15 @@ def test_compose_execute_equivalence_with_flags(self, flags, data, pipeline): for k in actual.keys(): self.assertEqual(expected[k], actual[k]) else: - self.assertTrue(expected, actual) + self.assertEqual(expected, actual) p = deepcopy(pipeline) actual = execute_compose(execute_compose(data, p, start=0, end=cutoff, **flags), p, start=cutoff, **flags) if isinstance(actual, dict): for k in actual.keys(): 
- self.assertTrue(expected[k], actual[k]) + self.assertEqual(expected[k], actual[k]) else: - self.assertTrue(expected, actual) + self.assertEqual(expected, actual) class TestComposeCallableInput(unittest.TestCase): diff --git a/tests/test_compute_f_beta.py b/tests/test_compute_f_beta.py index 85997577cf..43ebb6a6d5 100644 --- a/tests/test_compute_f_beta.py +++ b/tests/test_compute_f_beta.py @@ -15,6 +15,7 @@ import numpy as np import torch +from parameterized import parameterized from monai.metrics import FBetaScore from tests.utils import assert_allclose @@ -33,26 +34,21 @@ def test_expecting_success_and_device(self): assert_allclose(result, torch.Tensor([0.714286]), atol=1e-6, rtol=1e-6) np.testing.assert_equal(result.device, y_pred.device) - def test_expecting_success2(self): - metric = FBetaScore(beta=0.5) - metric( - y_pred=torch.Tensor([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), y=torch.Tensor([[1, 0, 1], [0, 1, 0], [1, 0, 1]]) - ) - assert_allclose(metric.aggregate()[0], torch.Tensor([0.609756]), atol=1e-6, rtol=1e-6) - - def test_expecting_success3(self): - metric = FBetaScore(beta=2) - metric( - y_pred=torch.Tensor([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), y=torch.Tensor([[1, 0, 1], [0, 1, 0], [1, 0, 1]]) - ) - assert_allclose(metric.aggregate()[0], torch.Tensor([0.862069]), atol=1e-6, rtol=1e-6) - - def test_denominator_is_zero(self): - metric = FBetaScore(beta=2) - metric( - y_pred=torch.Tensor([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), y=torch.Tensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]]) - ) - assert_allclose(metric.aggregate()[0], torch.Tensor([0.0]), atol=1e-6, rtol=1e-6) + @parameterized.expand( + [ + (0.5, torch.Tensor([[1, 0, 1], [0, 1, 0], [1, 0, 1]]), torch.Tensor([0.609756])), # success_beta_0_5 + (2, torch.Tensor([[1, 0, 1], [0, 1, 0], [1, 0, 1]]), torch.Tensor([0.862069])), # success_beta_2 + ( + 2, # success_beta_2, denominator_zero + torch.Tensor([[0, 0, 0], [0, 0, 0], [0, 0, 0]]), + torch.Tensor([0.0]), + ), + ] + ) + def test_success_and_zero(self, beta, y, 
expected_score): + metric = FBetaScore(beta=beta) + metric(y_pred=torch.Tensor([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), y=y) + assert_allclose(metric.aggregate()[0], expected_score, atol=1e-6, rtol=1e-6) def test_number_of_dimensions_less_than_2_should_raise_error(self): metric = FBetaScore() diff --git a/tests/test_compute_ho_ver_maps.py b/tests/test_compute_ho_ver_maps.py index bbd5230f04..6e46cf2b1e 100644 --- a/tests/test_compute_ho_ver_maps.py +++ b/tests/test_compute_ho_ver_maps.py @@ -67,8 +67,8 @@ class ComputeHoVerMapsTests(unittest.TestCase): def test_horizontal_certical_maps(self, in_type, arguments, mask, hv_mask): input_image = in_type(mask) result = ComputeHoVerMaps(**arguments)(input_image) - self.assertTrue(isinstance(result, torch.Tensor)) - self.assertTrue(str(result.dtype).split(".")[1] == arguments.get("dtype", "float32")) + self.assertIsInstance(result, torch.Tensor) + self.assertEqual(str(result.dtype).split(".")[1], arguments.get("dtype", "float32")) assert_allclose(result, hv_mask, type_test="tensor") diff --git a/tests/test_compute_ho_ver_maps_d.py b/tests/test_compute_ho_ver_maps_d.py index 7b5ac0d9d7..0734e2e731 100644 --- a/tests/test_compute_ho_ver_maps_d.py +++ b/tests/test_compute_ho_ver_maps_d.py @@ -71,8 +71,8 @@ def test_horizontal_certical_maps(self, in_type, arguments, mask, hv_mask): for k in mask.keys(): input_image[k] = in_type(mask[k]) result = ComputeHoVerMapsd(keys="mask", **arguments)(input_image)[hv_key] - self.assertTrue(isinstance(result, torch.Tensor)) - self.assertTrue(str(result.dtype).split(".")[1] == arguments.get("dtype", "float32")) + self.assertIsInstance(result, torch.Tensor) + self.assertEqual(str(result.dtype).split(".")[1], arguments.get("dtype", "float32")) assert_allclose(result, hv_mask[hv_key], type_test="tensor") diff --git a/tests/test_compute_regression_metrics.py b/tests/test_compute_regression_metrics.py index a8b7f03e47..c407ab6ba6 100644 --- a/tests/test_compute_regression_metrics.py +++ 
b/tests/test_compute_regression_metrics.py @@ -70,22 +70,24 @@ def test_shape_reduction(self): mt = mt_fn(reduction="mean") mt(in_tensor, in_tensor) out_tensor = mt.aggregate() - self.assertTrue(len(out_tensor.shape) == 1) + self.assertEqual(len(out_tensor.shape), 1) mt = mt_fn(reduction="sum") mt(in_tensor, in_tensor) out_tensor = mt.aggregate() - self.assertTrue(len(out_tensor.shape) == 0) + self.assertEqual(len(out_tensor.shape), 0) mt = mt_fn(reduction="sum") # test reduction arg overriding mt(in_tensor, in_tensor) out_tensor = mt.aggregate(reduction="mean_channel") - self.assertTrue(len(out_tensor.shape) == 1 and out_tensor.shape[0] == batch) + self.assertEqual(len(out_tensor.shape), 1) + self.assertEqual(out_tensor.shape[0], batch) mt = mt_fn(reduction="sum_channel") mt(in_tensor, in_tensor) out_tensor = mt.aggregate() - self.assertTrue(len(out_tensor.shape) == 1 and out_tensor.shape[0] == batch) + self.assertEqual(len(out_tensor.shape), 1) + self.assertEqual(out_tensor.shape[0], batch) def test_compare_numpy(self): set_determinism(seed=123) diff --git a/tests/test_concat_itemsd.py b/tests/test_concat_itemsd.py index 64c5d6e255..564ddf5c1f 100644 --- a/tests/test_concat_itemsd.py +++ b/tests/test_concat_itemsd.py @@ -30,7 +30,7 @@ def test_tensor_values(self): "img2": torch.tensor([[0, 1], [1, 2]], device=device), } result = ConcatItemsd(keys=["img1", "img2"], name="cat_img")(input_data) - self.assertTrue("cat_img" in result) + self.assertIn("cat_img", result) result["cat_img"] += 1 assert_allclose(result["img1"], torch.tensor([[0, 1], [1, 2]], device=device)) assert_allclose(result["cat_img"], torch.tensor([[1, 2], [2, 3], [1, 2], [2, 3]], device=device)) @@ -42,8 +42,8 @@ def test_metatensor_values(self): "img2": MetaTensor([[0, 1], [1, 2]], device=device), } result = ConcatItemsd(keys=["img1", "img2"], name="cat_img")(input_data) - self.assertTrue("cat_img" in result) - self.assertTrue(isinstance(result["cat_img"], MetaTensor)) + self.assertIn("cat_img", 
result) + self.assertIsInstance(result["cat_img"], MetaTensor) self.assertEqual(result["img1"].meta, result["cat_img"].meta) result["cat_img"] += 1 assert_allclose(result["img1"], torch.tensor([[0, 1], [1, 2]], device=device)) @@ -52,7 +52,7 @@ def test_metatensor_values(self): def test_numpy_values(self): input_data = {"img1": np.array([[0, 1], [1, 2]]), "img2": np.array([[0, 1], [1, 2]])} result = ConcatItemsd(keys=["img1", "img2"], name="cat_img")(input_data) - self.assertTrue("cat_img" in result) + self.assertIn("cat_img", result) result["cat_img"] += 1 np.testing.assert_allclose(result["img1"], np.array([[0, 1], [1, 2]])) np.testing.assert_allclose(result["cat_img"], np.array([[1, 2], [2, 3], [1, 2], [2, 3]])) diff --git a/tests/test_config_parser.py b/tests/test_config_parser.py index cc890a0522..cf1edc8f08 100644 --- a/tests/test_config_parser.py +++ b/tests/test_config_parser.py @@ -185,7 +185,7 @@ def test_function(self, config): if id in ("compute", "cls_compute"): parser[f"{id}#_mode_"] = "callable" func = parser.get_parsed_content(id=id) - self.assertTrue(id in parser.ref_resolver.resolved_content) + self.assertIn(id, parser.ref_resolver.resolved_content) if id == "error_func": with self.assertRaises(TypeError): func(1, 2) diff --git a/tests/test_conjugate_gradient.py b/tests/test_conjugate_gradient.py new file mode 100644 index 0000000000..64efe3b168 --- /dev/null +++ b/tests/test_conjugate_gradient.py @@ -0,0 +1,56 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import torch + +from monai.networks.layers import ConjugateGradient + + +class TestConjugateGradient(unittest.TestCase): + + def test_real_valued_inverse(self): + """Test ConjugateGradient with real-valued input: when the input is real + value, the output should be the inverse of the matrix.""" + a_dim = 3 + a_mat = torch.tensor([[1, 2, 3], [2, 1, 2], [3, 2, 1]], dtype=torch.float) + + def a_op(x): + return a_mat @ x + + cg_solver = ConjugateGradient(a_op, num_iter=100) + # define the measurement + y = torch.tensor([1, 2, 3], dtype=torch.float) + # solve for x + x = cg_solver(torch.zeros(a_dim), y) + x_ref = torch.linalg.solve(a_mat, y) + # assert torch.allclose(x, x_ref, atol=1e-6), 'CG solver failed to converge to reference solution' + self.assertTrue(torch.allclose(x, x_ref, atol=1e-6)) + + def test_complex_valued_inverse(self): + a_dim = 3 + a_mat = torch.tensor([[1, 2, 3], [2, 1, 2], [3, 2, 1]], dtype=torch.complex64) + + def a_op(x): + return a_mat @ x + + cg_solver = ConjugateGradient(a_op, num_iter=100) + y = torch.tensor([1, 2, 3], dtype=torch.complex64) + x = cg_solver(torch.zeros(a_dim, dtype=torch.complex64), y) + x_ref = torch.linalg.solve(a_mat, y) + self.assertTrue(torch.allclose(x, x_ref, atol=1e-6)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_controlnet.py b/tests/test_controlnet.py new file mode 100644 index 0000000000..4746c7ce22 --- /dev/null +++ b/tests/test_controlnet.py @@ -0,0 +1,215 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os +import tempfile +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.apps import download_url +from monai.networks import eval_mode +from monai.networks.nets.controlnet import ControlNet +from monai.utils import optional_import +from tests.utils import skip_if_downloading_fails, testing_data_config + +_, has_einops = optional_import("einops") +UNCOND_CASES_2D = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + }, + (1, 8, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "resblock_updown": True, + }, + (1, 8, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (4, 4, 4), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 4, + }, + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "resblock_updown": True, + }, + (1, 8, 4, 4), + ], +] + +UNCOND_CASES_3D = [ + [ + { + "spatial_dims": 3, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + }, + (1, 8, 4, 4, 4), + ], + [ + { + 
"spatial_dims": 3, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (4, 4, 4), + "num_head_channels": 4, + "attention_levels": (False, False, False), + "norm_num_groups": 4, + "resblock_updown": True, + }, + (1, 4, 4, 4, 4), + ], +] + +COND_CASES_2D = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + }, + (1, 8, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "resblock_updown": True, + }, + (1, 8, 4, 4), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "upcast_attention": True, + }, + (1, 8, 4, 4), + ], +] + + +class TestControlNet(unittest.TestCase): + @parameterized.expand(UNCOND_CASES_2D + UNCOND_CASES_3D) + @skipUnless(has_einops, "Requires einops") + def test_shape_unconditioned_models(self, input_param, expected_output_shape): + input_param["conditioning_embedding_in_channels"] = input_param["in_channels"] + input_param["conditioning_embedding_num_channels"] = (input_param["channels"][0],) + net = ControlNet(**input_param) + with eval_mode(net): + x = torch.rand((1, 1) + (16,) * input_param["spatial_dims"]) + timesteps = torch.randint(0, 1000, (1,)).long() + controlnet_cond = torch.rand((1, 1) + (16,) * input_param["spatial_dims"]) + result = net.forward(x, timesteps=timesteps, controlnet_cond=controlnet_cond) + self.assertEqual(len(result[0]), 2 * len(input_param["channels"])) + self.assertEqual(result[1].shape, 
expected_output_shape) + + @parameterized.expand(COND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models(self, input_param, expected_output_shape): + input_param["conditioning_embedding_in_channels"] = input_param["in_channels"] + input_param["conditioning_embedding_num_channels"] = (input_param["channels"][0],) + net = ControlNet(**input_param) + with eval_mode(net): + x = torch.rand((1, 1) + (16,) * input_param["spatial_dims"]) + timesteps = torch.randint(0, 1000, (1,)).long() + controlnet_cond = torch.rand((1, 1) + (16,) * input_param["spatial_dims"]) + result = net.forward(x, timesteps=timesteps, controlnet_cond=controlnet_cond, context=torch.rand((1, 1, 3))) + self.assertEqual(len(result[0]), 2 * len(input_param["channels"])) + self.assertEqual(result[1].shape, expected_output_shape) + + @skipUnless(has_einops, "Requires einops") + def test_compatibility_with_monai_generative(self): + # test loading weights from a model saved in MONAI Generative, version 0.2.3 + with skip_if_downloading_fails(): + net = ControlNet( + spatial_dims=2, + in_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=8, + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=3, + resblock_updown=True, + ) + + tmpdir = tempfile.mkdtemp() + key = "controlnet_monai_generative_weights" + url = testing_data_config("models", key, "url") + hash_type = testing_data_config("models", key, "hash_type") + hash_val = testing_data_config("models", key, "hash_val") + filename = "controlnet_monai_generative_weights.pt" + + weight_path = os.path.join(tmpdir, filename) + download_url(url=url, filepath=weight_path, hash_val=hash_val, hash_type=hash_type) + + net.load_old_state_dict(torch.load(weight_path), verbose=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_controlnet_inferers.py b/tests/test_controlnet_inferers.py new file mode 100644 index 
0000000000..e3b0aeb5a2 --- /dev/null +++ b/tests/test_controlnet_inferers.py @@ -0,0 +1,1310 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.inferers import ControlNetDiffusionInferer, ControlNetLatentDiffusionInferer +from monai.networks.nets import ( + VQVAE, + AutoencoderKL, + ControlNet, + DiffusionModelUNet, + SPADEAutoencoderKL, + SPADEDiffusionModelUNet, +) +from monai.networks.schedulers import DDIMScheduler, DDPMScheduler +from monai.utils import optional_import + +_, has_scipy = optional_import("scipy") +_, has_einops = optional_import("einops") + + +CNDM_TEST_CASES = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": [8], + "norm_num_groups": 8, + "attention_levels": [True], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + { + "spatial_dims": 2, + "in_channels": 1, + "channels": [8], + "attention_levels": [True], + "norm_num_groups": 8, + "num_res_blocks": 1, + "num_head_channels": 8, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (2, 1, 8, 8), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": [8], + "norm_num_groups": 8, + "attention_levels": [True], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + { + "spatial_dims": 3, + 
"in_channels": 1, + "channels": [8], + "attention_levels": [True], + "num_res_blocks": 1, + "norm_num_groups": 8, + "num_head_channels": 8, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (2, 1, 8, 8, 8), + ], +] +LATENT_CNDM_TEST_CASES = [ + [ + "AutoencoderKL", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + { + "spatial_dims": 2, + "in_channels": 3, + "channels": [4, 4], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 4, + "num_head_channels": 4, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], + [ + "VQVAE", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": [4, 4], + "num_res_layers": 1, + "num_res_channels": [4, 4], + "downsample_parameters": ((2, 4, 1, 1), (2, 4, 1, 1)), + "upsample_parameters": ((2, 4, 1, 1, 0), (2, 4, 1, 1, 0)), + "num_embeddings": 16, + "embedding_dim": 3, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [8, 8], + "norm_num_groups": 8, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + { + "spatial_dims": 2, + "in_channels": 3, + "channels": [8, 8], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 8, + "num_head_channels": 8, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 16, 16), + (1, 3, 4, 4), + ], + [ 
+ "VQVAE", + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": [4, 4], + "num_res_layers": 1, + "num_res_channels": [4, 4], + "downsample_parameters": ((2, 4, 1, 1), (2, 4, 1, 1)), + "upsample_parameters": ((2, 4, 1, 1, 0), (2, 4, 1, 1, 0)), + "num_embeddings": 16, + "embedding_dim": 3, + }, + "DiffusionModelUNet", + { + "spatial_dims": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [8, 8], + "norm_num_groups": 8, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + { + "spatial_dims": 3, + "in_channels": 3, + "channels": [8, 8], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 8, + "num_head_channels": 8, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 16, 16, 16), + (1, 3, 4, 4, 4), + ], +] +LATENT_CNDM_TEST_CASES_DIFF_SHAPES = [ + [ + "AutoencoderKL", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + { + "spatial_dims": 2, + "in_channels": 3, + "channels": [4, 4], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 4, + "num_head_channels": 4, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 12, 12), + (1, 3, 8, 8), + ], + [ + "VQVAE", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": [4, 4], + "num_res_layers": 1, + "num_res_channels": [4, 4], + "downsample_parameters": ((2, 4, 1, 1), (2, 4, 1, 1)), + "upsample_parameters": ((2, 4, 
1, 1, 0), (2, 4, 1, 1, 0)), + "num_embeddings": 16, + "embedding_dim": 3, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [8, 8], + "norm_num_groups": 8, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + { + "spatial_dims": 2, + "in_channels": 3, + "channels": [8, 8], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 8, + "num_head_channels": 8, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 12, 12), + (1, 3, 8, 8), + ], + [ + "VQVAE", + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": [4, 4], + "num_res_layers": 1, + "num_res_channels": [4, 4], + "downsample_parameters": ((2, 4, 1, 1), (2, 4, 1, 1)), + "upsample_parameters": ((2, 4, 1, 1, 0), (2, 4, 1, 1, 0)), + "num_embeddings": 16, + "embedding_dim": 3, + }, + "DiffusionModelUNet", + { + "spatial_dims": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [8, 8], + "norm_num_groups": 8, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + { + "spatial_dims": 3, + "in_channels": 3, + "channels": [8, 8], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 8, + "num_head_channels": 8, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 12, 12, 12), + (1, 3, 8, 8, 8), + ], + [ + "SPADEAutoencoderKL", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + 
"num_res_blocks": 1, + "num_head_channels": 4, + }, + { + "spatial_dims": 2, + "in_channels": 3, + "channels": [4, 4], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 4, + "num_head_channels": 4, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], + [ + "AutoencoderKL", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "SPADEDiffusionModelUNet", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + { + "spatial_dims": 2, + "in_channels": 3, + "channels": [4, 4], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 4, + "num_head_channels": 4, + "conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], + [ + "SPADEAutoencoderKL", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "SPADEDiffusionModelUNet", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + { + "spatial_dims": 2, + "in_channels": 3, + "channels": [4, 4], + "attention_levels": [False, False], + "num_res_blocks": 1, + "norm_num_groups": 4, + "num_head_channels": 4, + 
"conditioning_embedding_num_channels": [16], + "conditioning_embedding_in_channels": 1, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], +] + + +class ControlNetTestDiffusionSamplingInferer(unittest.TestCase): + @parameterized.expand(CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_call(self, model_params, controlnet_params, input_shape): + model = DiffusionModelUNet(**model_params) + controlnet = ControlNet(**controlnet_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + controlnet.to(device) + controlnet.eval() + input = torch.randn(input_shape).to(device) + mask = torch.randn(input_shape).to(device) + noise = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetDiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + sample = inferer( + inputs=input, noise=noise, diffusion_model=model, controlnet=controlnet, timesteps=timesteps, cn_cond=mask + ) + self.assertEqual(sample.shape, input_shape) + + @parameterized.expand(CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sample_intermediates(self, model_params, controlnet_params, input_shape): + model = DiffusionModelUNet(**model_params) + controlnet = ControlNet(**controlnet_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + controlnet.to(device) + controlnet.eval() + noise = torch.randn(input_shape).to(device) + mask = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetDiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + sample, intermediates = inferer.sample( + input_noise=noise, + diffusion_model=model, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + 
save_intermediates=True, + intermediate_steps=1, + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_ddpm_sampler(self, model_params, controlnet_params, input_shape): + model = DiffusionModelUNet(**model_params) + controlnet = ControlNet(**controlnet_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + controlnet.to(device) + controlnet.eval() + mask = torch.randn(input_shape).to(device) + noise = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=1000) + inferer = ControlNetDiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + sample, intermediates = inferer.sample( + input_noise=noise, + diffusion_model=model, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + save_intermediates=True, + intermediate_steps=1, + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_ddim_sampler(self, model_params, controlnet_params, input_shape): + model = DiffusionModelUNet(**model_params) + controlnet = ControlNet(**controlnet_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + controlnet.to(device) + controlnet.eval() + mask = torch.randn(input_shape).to(device) + noise = torch.randn(input_shape).to(device) + scheduler = DDIMScheduler(num_train_timesteps=1000) + inferer = ControlNetDiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + sample, intermediates = inferer.sample( + input_noise=noise, + diffusion_model=model, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + save_intermediates=True, + intermediate_steps=1, + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def 
test_sampler_conditioned(self, model_params, controlnet_params, input_shape): + model_params["with_conditioning"] = True + model_params["cross_attention_dim"] = 3 + model = DiffusionModelUNet(**model_params) + controlnet = ControlNet(**controlnet_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + controlnet.to(device) + controlnet.eval() + mask = torch.randn(input_shape).to(device) + noise = torch.randn(input_shape).to(device) + scheduler = DDIMScheduler(num_train_timesteps=1000) + inferer = ControlNetDiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + conditioning = torch.randn([input_shape[0], 1, 3]).to(device) + sample, intermediates = inferer.sample( + input_noise=noise, + diffusion_model=model, + controlnet=controlnet, + cn_cond=mask, + scheduler=scheduler, + save_intermediates=True, + intermediate_steps=1, + conditioning=conditioning, + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_get_likelihood(self, model_params, controlnet_params, input_shape): + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + controlnet = ControlNet(**controlnet_params) + controlnet.to(device) + controlnet.eval() + input = torch.randn(input_shape).to(device) + mask = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetDiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + likelihood, intermediates = inferer.get_likelihood( + inputs=input, + diffusion_model=model, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + save_intermediates=True, + ) + self.assertEqual(intermediates[0].shape, input.shape) + self.assertEqual(likelihood.shape[0], input.shape[0]) + + @unittest.skipUnless(has_scipy, "Requires scipy 
library.") + def test_normal_cdf(self): + from scipy.stats import norm + + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetDiffusionInferer(scheduler=scheduler) + x = torch.linspace(-10, 10, 20) + cdf_approx = inferer._approx_standard_normal_cdf(x) + cdf_true = norm.cdf(x) + torch.testing.assert_allclose(cdf_approx, cdf_true, atol=1e-3, rtol=1e-5) + + @parameterized.expand(CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sampler_conditioned_concat(self, model_params, controlnet_params, input_shape): + # copy the model_params dict to prevent from modifying test cases + model_params = model_params.copy() + n_concat_channel = 2 + model_params["in_channels"] = model_params["in_channels"] + n_concat_channel + model_params["cross_attention_dim"] = None + model_params["with_conditioning"] = False + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + controlnet = ControlNet(**controlnet_params) + controlnet.to(device) + controlnet.eval() + noise = torch.randn(input_shape).to(device) + mask = torch.randn(input_shape).to(device) + conditioning_shape = list(input_shape) + conditioning_shape[1] = n_concat_channel + conditioning = torch.randn(conditioning_shape).to(device) + scheduler = DDIMScheduler(num_train_timesteps=1000) + inferer = ControlNetDiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + sample, intermediates = inferer.sample( + input_noise=noise, + diffusion_model=model, + controlnet=controlnet, + cn_cond=mask, + scheduler=scheduler, + save_intermediates=True, + intermediate_steps=1, + conditioning=conditioning, + mode="concat", + ) + self.assertEqual(len(intermediates), 10) + + +class LatentControlNetTestDiffusionSamplingInferer(unittest.TestCase): + @parameterized.expand(LATENT_CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_prediction_shape( + self, + ae_model_type, + 
autoencoder_params, + dm_model_type, + stage_2_params, + controlnet_params, + input_shape, + latent_shape, + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + controlnet = ControlNet(**controlnet_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + + input = torch.randn(input_shape).to(device) + mask = torch.randn(input_shape).to(device) + noise = torch.randn(latent_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetLatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + controlnet=controlnet, + cn_cond=mask, + seg=input_seg, + noise=noise, + timesteps=timesteps, + ) + else: + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + noise=noise, + timesteps=timesteps, + controlnet=controlnet, + cn_cond=mask, + ) + self.assertEqual(prediction.shape, latent_shape) + + @parameterized.expand(LATENT_CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sample_shape( + self, + ae_model_type, + 
autoencoder_params, + dm_model_type, + stage_2_params, + controlnet_params, + input_shape, + latent_shape, + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + controlnet = ControlNet(**controlnet_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + + noise = torch.randn(latent_shape).to(device) + mask = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetLatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if ae_model_type == "SPADEAutoencoderKL" or dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + controlnet=controlnet, + cn_cond=mask, + scheduler=scheduler, + seg=input_seg, + ) + else: + sample = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + ) + self.assertEqual(sample.shape, input_shape) + + @parameterized.expand(LATENT_CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sample_intermediates( + self, + ae_model_type, + autoencoder_params, + dm_model_type, + stage_2_params, + controlnet_params, + input_shape, + latent_shape, + ): + stage_1 = None + + 
if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + controlnet = ControlNet(**controlnet_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + + noise = torch.randn(latent_shape).to(device) + mask = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetLatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if ae_model_type == "SPADEAutoencoderKL" or dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + seg=input_seg, + controlnet=controlnet, + cn_cond=mask, + ) + + # TODO: this isn't correct, should the above produce intermediates as well? + # This test has always passed so is this branch not being used? 
+ intermediates = None + else: + sample, intermediates = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + save_intermediates=True, + intermediate_steps=1, + controlnet=controlnet, + cn_cond=mask, + ) + + self.assertEqual(len(intermediates), 10) + self.assertEqual(intermediates[0].shape, input_shape) + + @parameterized.expand(LATENT_CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_get_likelihoods( + self, + ae_model_type, + autoencoder_params, + dm_model_type, + stage_2_params, + controlnet_params, + input_shape, + latent_shape, + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + controlnet = ControlNet(**controlnet_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + + input = torch.randn(input_shape).to(device) + mask = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetLatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample, intermediates = inferer.get_likelihood( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + 
controlnet=controlnet, + cn_cond=mask, + scheduler=scheduler, + save_intermediates=True, + seg=input_seg, + ) + else: + sample, intermediates = inferer.get_likelihood( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + save_intermediates=True, + ) + self.assertEqual(len(intermediates), 10) + self.assertEqual(intermediates[0].shape, latent_shape) + + @parameterized.expand(LATENT_CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_resample_likelihoods( + self, + ae_model_type, + autoencoder_params, + dm_model_type, + stage_2_params, + controlnet_params, + input_shape, + latent_shape, + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + controlnet = ControlNet(**controlnet_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + + input = torch.randn(input_shape).to(device) + mask = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetLatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample, intermediates = inferer.get_likelihood( + 
inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + save_intermediates=True, + resample_latent_likelihoods=True, + seg=input_seg, + ) + else: + sample, intermediates = inferer.get_likelihood( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + save_intermediates=True, + resample_latent_likelihoods=True, + ) + self.assertEqual(len(intermediates), 10) + self.assertEqual(intermediates[0].shape[2:], input_shape[2:]) + + @parameterized.expand(LATENT_CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_prediction_shape_conditioned_concat( + self, + ae_model_type, + autoencoder_params, + dm_model_type, + stage_2_params, + controlnet_params, + input_shape, + latent_shape, + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + stage_2_params = stage_2_params.copy() + n_concat_channel = 3 + stage_2_params["in_channels"] = stage_2_params["in_channels"] + n_concat_channel + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + controlnet = ControlNet(**controlnet_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + + input = torch.randn(input_shape).to(device) + mask = torch.randn(input_shape).to(device) + noise = torch.randn(latent_shape).to(device) + conditioning_shape = list(latent_shape) + conditioning_shape[1] = n_concat_channel + conditioning = torch.randn(conditioning_shape).to(device) + + scheduler = 
DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetLatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + noise=noise, + controlnet=controlnet, + cn_cond=mask, + timesteps=timesteps, + condition=conditioning, + mode="concat", + seg=input_seg, + ) + else: + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + noise=noise, + controlnet=controlnet, + cn_cond=mask, + timesteps=timesteps, + condition=conditioning, + mode="concat", + ) + self.assertEqual(prediction.shape, latent_shape) + + @parameterized.expand(LATENT_CNDM_TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sample_shape_conditioned_concat( + self, + ae_model_type, + autoencoder_params, + dm_model_type, + stage_2_params, + controlnet_params, + input_shape, + latent_shape, + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + stage_2_params = stage_2_params.copy() + n_concat_channel = 3 + stage_2_params["in_channels"] = stage_2_params["in_channels"] + n_concat_channel + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + controlnet = 
ControlNet(**controlnet_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + + noise = torch.randn(latent_shape).to(device) + mask = torch.randn(input_shape).to(device) + conditioning_shape = list(latent_shape) + conditioning_shape[1] = n_concat_channel + conditioning = torch.randn(conditioning_shape).to(device) + + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetLatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + controlnet=controlnet, + cn_cond=mask, + scheduler=scheduler, + conditioning=conditioning, + mode="concat", + seg=input_seg, + ) + else: + sample = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + controlnet=controlnet, + cn_cond=mask, + scheduler=scheduler, + conditioning=conditioning, + mode="concat", + ) + self.assertEqual(sample.shape, input_shape) + + @parameterized.expand(LATENT_CNDM_TEST_CASES_DIFF_SHAPES) + @skipUnless(has_einops, "Requires einops") + def test_sample_shape_different_latents( + self, + ae_model_type, + autoencoder_params, + dm_model_type, + stage_2_params, + controlnet_params, + input_shape, + latent_shape, + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = 
SPADEAutoencoderKL(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + controlnet = ControlNet(**controlnet_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + + input = torch.randn(input_shape).to(device) + noise = torch.randn(latent_shape).to(device) + mask = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + # We infer the VAE shape + autoencoder_latent_shape = [i // (2 ** (len(autoencoder_params["channels"]) - 1)) for i in input_shape[2:]] + inferer = ControlNetLatentDiffusionInferer( + scheduler=scheduler, + scale_factor=1.0, + ldm_latent_shape=list(latent_shape[2:]), + autoencoder_latent_shape=autoencoder_latent_shape, + ) + scheduler.set_timesteps(num_inference_steps=10) + + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + controlnet=controlnet, + cn_cond=mask, + noise=noise, + timesteps=timesteps, + seg=input_seg, + ) + else: + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + noise=noise, + controlnet=controlnet, + cn_cond=mask, + timesteps=timesteps, + ) + self.assertEqual(prediction.shape, latent_shape) + + @skipUnless(has_einops, "Requires einops") + def test_incompatible_spade_setup(self): + stage_1 = SPADEAutoencoderKL( + spatial_dims=2, + label_nc=6, + 
in_channels=1, + out_channels=1, + channels=(4, 4), + latent_channels=3, + attention_levels=[False, False], + num_res_blocks=1, + with_encoder_nonlocal_attn=False, + with_decoder_nonlocal_attn=False, + norm_num_groups=4, + ) + stage_2 = SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=3, + out_channels=3, + channels=[4, 4], + norm_num_groups=4, + attention_levels=[False, False], + num_res_blocks=1, + num_head_channels=4, + ) + controlnet = ControlNet( + spatial_dims=2, + in_channels=1, + channels=[4, 4], + norm_num_groups=4, + attention_levels=[False, False], + num_res_blocks=1, + num_head_channels=4, + conditioning_embedding_num_channels=[16], + ) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + controlnet.to(device) + controlnet.to(device) + stage_1.eval() + stage_2.eval() + controlnet.eval() + noise = torch.randn((1, 3, 4, 4)).to(device) + mask = torch.randn((1, 1, 4, 4)).to(device) + input_seg = torch.randn((1, 3, 8, 8)).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = ControlNetLatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + with self.assertRaises(ValueError): + _ = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + controlnet=controlnet, + cn_cond=mask, + seg=input_seg, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_controlnet_maisi.py b/tests/test_controlnet_maisi.py new file mode 100644 index 0000000000..bfdf25ec6e --- /dev/null +++ b/tests/test_controlnet_maisi.py @@ -0,0 +1,171 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.apps.generation.maisi.networks.controlnet_maisi import ControlNetMaisi +from monai.networks import eval_mode +from monai.utils import optional_import +from tests.utils import SkipIfBeforePyTorchVersion + +_, has_einops = optional_import("einops") + +TEST_CASES = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "conditioning_embedding_in_channels": 1, + "conditioning_embedding_num_channels": (8, 8), + "use_checkpointing": False, + }, + 6, + (1, 8, 4, 4), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "conditioning_embedding_in_channels": 1, + "conditioning_embedding_num_channels": (8, 8), + "use_checkpointing": True, + }, + 6, + (1, 8, 4, 4, 4), + ], +] + +TEST_CASES_CONDITIONAL = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "conditioning_embedding_in_channels": 1, + "conditioning_embedding_num_channels": (8, 8), + "use_checkpointing": False, + "with_conditioning": True, + "cross_attention_dim": 2, + }, + 6, + (1, 8, 4, 4), + ], + [ + { + "spatial_dims": 
3, + "in_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "conditioning_embedding_in_channels": 1, + "conditioning_embedding_num_channels": (8, 8), + "use_checkpointing": True, + "with_conditioning": True, + "cross_attention_dim": 2, + }, + 6, + (1, 8, 4, 4, 4), + ], +] + +TEST_CASES_ERROR = [ + [ + {"spatial_dims": 2, "in_channels": 1, "with_conditioning": True, "cross_attention_dim": None}, + "ControlNet expects dimension of the cross-attention conditioning (cross_attention_dim) " + "to be specified when with_conditioning=True.", + ], + [ + {"spatial_dims": 2, "in_channels": 1, "with_conditioning": False, "cross_attention_dim": 2}, + "ControlNet expects with_conditioning=True when specifying the cross_attention_dim.", + ], + [ + {"spatial_dims": 2, "in_channels": 1, "num_channels": (8, 16), "norm_num_groups": 16}, + f"ControlNet expects all channels to be a multiple of norm_num_groups, but got" + f" channels={(8, 16)} and norm_num_groups={16}", + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "num_channels": (8, 16), + "attention_levels": (True,), + "norm_num_groups": 8, + }, + f"ControlNet expects channels to have the same length as attention_levels, but got " + f"channels={(8, 16)} and attention_levels={(True,)}", + ], +] + + +@SkipIfBeforePyTorchVersion((2, 0)) +class TestControlNet(unittest.TestCase): + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_shape_unconditioned_models(self, input_param, expected_num_down_blocks_residuals, expected_shape): + net = ControlNetMaisi(**input_param) + with eval_mode(net): + x = torch.rand((1, 1, 16, 16)) if input_param["spatial_dims"] == 2 else torch.rand((1, 1, 16, 16, 16)) + timesteps = torch.randint(0, 1000, (1,)).long() + controlnet_cond = ( + torch.rand((1, 1, 32, 32)) if input_param["spatial_dims"] == 2 else torch.rand((1, 1, 32, 32, 32)) + ) + result = 
net.forward(x, timesteps, controlnet_cond) + self.assertEqual(len(result[0]), expected_num_down_blocks_residuals) + self.assertEqual(result[1].shape, expected_shape) + + @parameterized.expand(TEST_CASES_CONDITIONAL) + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models(self, input_param, expected_num_down_blocks_residuals, expected_shape): + net = ControlNetMaisi(**input_param) + with eval_mode(net): + x = torch.rand((1, 1, 16, 16)) if input_param["spatial_dims"] == 2 else torch.rand((1, 1, 16, 16, 16)) + timesteps = torch.randint(0, 1000, (1,)).long() + controlnet_cond = ( + torch.rand((1, 1, 32, 32)) if input_param["spatial_dims"] == 2 else torch.rand((1, 1, 32, 32, 32)) + ) + context = torch.randn((1, 1, input_param["cross_attention_dim"])) + result = net.forward(x, timesteps, controlnet_cond, context=context) + self.assertEqual(len(result[0]), expected_num_down_blocks_residuals) + self.assertEqual(result[1].shape, expected_shape) + + @parameterized.expand(TEST_CASES_ERROR) + def test_error_input(self, input_param, expected_error): + with self.assertRaises(ValueError) as context: # output shape too small + _ = ControlNetMaisi(**input_param) + runtime_error = context.exception + self.assertEqual(str(runtime_error), expected_error) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_convert_data_type.py b/tests/test_convert_data_type.py index b95539f4b7..a27a05cf28 100644 --- a/tests/test_convert_data_type.py +++ b/tests/test_convert_data_type.py @@ -73,6 +73,7 @@ class TestTensor(torch.Tensor): + __test__ = False # indicate to pytest that this class is not intended for collection pass diff --git a/tests/test_convert_to_onnx.py b/tests/test_convert_to_onnx.py index 398d260c52..798c510800 100644 --- a/tests/test_convert_to_onnx.py +++ b/tests/test_convert_to_onnx.py @@ -12,6 +12,7 @@ from __future__ import annotations import itertools +import platform import unittest import torch @@ -29,6 +30,12 @@ TESTS = 
list(itertools.product(TORCH_DEVICE_OPTIONS, [True, False], [True, False])) TESTS_ORT = list(itertools.product(TORCH_DEVICE_OPTIONS, [True])) +ON_AARCH64 = platform.machine() == "aarch64" +if ON_AARCH64: + rtol, atol = 1e-1, 1e-2 +else: + rtol, atol = 1e-3, 1e-4 + onnx, _ = optional_import("onnx") @@ -56,8 +63,8 @@ def test_unet(self, device, use_trace, use_ort): device=device, use_ort=use_ort, use_trace=use_trace, - rtol=1e-3, - atol=1e-4, + rtol=rtol, + atol=atol, ) else: # https://github.com/pytorch/pytorch/blob/release/1.9/torch/onnx/__init__.py#L182 @@ -72,8 +79,8 @@ def test_unet(self, device, use_trace, use_ort): device=device, use_ort=use_ort, use_trace=use_trace, - rtol=1e-3, - atol=1e-4, + rtol=rtol, + atol=atol, ) self.assertTrue(isinstance(onnx_model, onnx.ModelProto)) @@ -107,8 +114,8 @@ def test_seg_res_net(self, device, use_ort): device=device, use_ort=use_ort, use_trace=True, - rtol=1e-3, - atol=1e-4, + rtol=rtol, + atol=atol, ) self.assertTrue(isinstance(onnx_model, onnx.ModelProto)) diff --git a/tests/test_crossattention.py b/tests/test_crossattention.py new file mode 100644 index 0000000000..e034e42290 --- /dev/null +++ b/tests/test_crossattention.py @@ -0,0 +1,186 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import numpy as np +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.blocks.crossattention import CrossAttentionBlock +from monai.networks.layers.factories import RelPosEmbedding +from monai.utils import optional_import +from tests.utils import SkipIfBeforePyTorchVersion, assert_allclose + +einops, has_einops = optional_import("einops") + +TEST_CASE_CABLOCK = [] +for dropout_rate in np.linspace(0, 1, 4): + for hidden_size in [360, 480, 600, 768]: + for num_heads in [4, 6, 8, 12]: + for rel_pos_embedding in [None, RelPosEmbedding.DECOMPOSED]: + for input_size in [(16, 32), (8, 8, 8)]: + for flash_attn in [True, False]: + test_case = [ + { + "hidden_size": hidden_size, + "num_heads": num_heads, + "dropout_rate": dropout_rate, + "rel_pos_embedding": rel_pos_embedding if not flash_attn else None, + "input_size": input_size, + "use_flash_attention": flash_attn, + }, + (2, 512, hidden_size), + (2, 512, hidden_size), + ] + TEST_CASE_CABLOCK.append(test_case) + + +class TestResBlock(unittest.TestCase): + + @parameterized.expand(TEST_CASE_CABLOCK) + @skipUnless(has_einops, "Requires einops") + @SkipIfBeforePyTorchVersion((2, 0)) + def test_shape(self, input_param, input_shape, expected_shape): + # Without flash attention + net = CrossAttentionBlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape), context=torch.randn(2, 512, input_param["hidden_size"])) + self.assertEqual(result.shape, expected_shape) + + def test_ill_arg(self): + with self.assertRaises(ValueError): + CrossAttentionBlock(hidden_size=128, num_heads=12, dropout_rate=6.0) + + with self.assertRaises(ValueError): + CrossAttentionBlock(hidden_size=620, num_heads=8, dropout_rate=0.4) + + @SkipIfBeforePyTorchVersion((2, 0)) + def test_save_attn_with_flash_attention(self): + with self.assertRaises(ValueError): + 
CrossAttentionBlock( + hidden_size=128, num_heads=3, dropout_rate=0.1, use_flash_attention=True, save_attn=True + ) + + @SkipIfBeforePyTorchVersion((2, 0)) + def test_rel_pos_embedding_with_flash_attention(self): + with self.assertRaises(ValueError): + CrossAttentionBlock( + hidden_size=128, + num_heads=3, + dropout_rate=0.1, + use_flash_attention=True, + save_attn=False, + rel_pos_embedding=RelPosEmbedding.DECOMPOSED, + ) + + @skipUnless(has_einops, "Requires einops") + def test_attention_dim_not_multiple_of_heads(self): + with self.assertRaises(ValueError): + CrossAttentionBlock(hidden_size=128, num_heads=3, dropout_rate=0.1) + + @skipUnless(has_einops, "Requires einops") + def test_inner_dim_different(self): + CrossAttentionBlock(hidden_size=128, num_heads=4, dropout_rate=0.1, dim_head=30) + + def test_causal_no_sequence_length(self): + with self.assertRaises(ValueError): + CrossAttentionBlock(hidden_size=128, num_heads=4, dropout_rate=0.1, causal=True) + + @skipUnless(has_einops, "Requires einops") + @SkipIfBeforePyTorchVersion((2, 0)) + def test_causal_flash_attention(self): + block = CrossAttentionBlock( + hidden_size=128, + num_heads=1, + dropout_rate=0.1, + causal=True, + sequence_length=16, + save_attn=False, + use_flash_attention=True, + ) + input_shape = (1, 16, 128) + # Check it runs correctly + block(torch.randn(input_shape)) + + @skipUnless(has_einops, "Requires einops") + def test_causal(self): + block = CrossAttentionBlock( + hidden_size=128, num_heads=1, dropout_rate=0.1, causal=True, sequence_length=16, save_attn=True + ) + input_shape = (1, 16, 128) + block(torch.randn(input_shape)) + # check upper triangular part of the attention matrix is zero + assert torch.triu(block.att_mat, diagonal=1).sum() == 0 + + @skipUnless(has_einops, "Requires einops") + def test_context_input(self): + block = CrossAttentionBlock( + hidden_size=128, num_heads=1, dropout_rate=0.1, causal=True, sequence_length=16, context_input_size=12 + ) + input_shape = (1, 16, 128) 
+ block(torch.randn(input_shape), context=torch.randn(1, 3, 12)) + + @skipUnless(has_einops, "Requires einops") + def test_context_wrong_input_size(self): + block = CrossAttentionBlock( + hidden_size=128, num_heads=1, dropout_rate=0.1, causal=True, sequence_length=16, context_input_size=12 + ) + input_shape = (1, 16, 128) + with self.assertRaises(RuntimeError): + block(torch.randn(input_shape), context=torch.randn(1, 3, 24)) + + @skipUnless(has_einops, "Requires einops") + def test_access_attn_matrix(self): + # input format + hidden_size = 128 + num_heads = 2 + dropout_rate = 0 + input_shape = (2, 256, hidden_size) + + # be not able to access the matrix + no_matrix_acess_blk = CrossAttentionBlock( + hidden_size=hidden_size, num_heads=num_heads, dropout_rate=dropout_rate + ) + no_matrix_acess_blk(torch.randn(input_shape)) + assert isinstance(no_matrix_acess_blk.att_mat, torch.Tensor) + # no of elements is zero + assert no_matrix_acess_blk.att_mat.nelement() == 0 + + # be able to acess the attention matrix. 
+ matrix_acess_blk = CrossAttentionBlock( + hidden_size=hidden_size, num_heads=num_heads, dropout_rate=dropout_rate, save_attn=True + ) + matrix_acess_blk(torch.randn(input_shape)) + assert matrix_acess_blk.att_mat.shape == (input_shape[0], input_shape[0], input_shape[1], input_shape[1]) + + @parameterized.expand([[True], [False]]) + @skipUnless(has_einops, "Requires einops") + @SkipIfBeforePyTorchVersion((2, 0)) + def test_flash_attention(self, causal): + input_param = {"hidden_size": 128, "num_heads": 1, "causal": causal, "sequence_length": 16 if causal else None} + device = "cuda:0" if torch.cuda.is_available() else "cpu" + block_w_flash_attention = CrossAttentionBlock(**input_param, use_flash_attention=True).to(device) + block_wo_flash_attention = CrossAttentionBlock(**input_param, use_flash_attention=False).to(device) + block_wo_flash_attention.load_state_dict(block_w_flash_attention.state_dict()) + test_data = torch.randn(1, 16, 128).to(device) + + out_1 = block_w_flash_attention(test_data) + out_2 = block_wo_flash_attention(test_data) + assert_allclose(out_1, out_2, atol=1e-4) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_cucim_dict_transform.py b/tests/test_cucim_dict_transform.py index d2dcc6aa5f..3c5703a34c 100644 --- a/tests/test_cucim_dict_transform.py +++ b/tests/test_cucim_dict_transform.py @@ -80,8 +80,8 @@ class TestCuCIMDict(unittest.TestCase): def test_tramsforms_numpy_single(self, params, input, expected): input = {"image": input} output = CuCIMd(keys="image", **params)(input)["image"] - self.assertTrue(output.dtype == expected.dtype) - self.assertTrue(isinstance(output, np.ndarray)) + self.assertEqual(output.dtype, expected.dtype) + self.assertIsInstance(output, np.ndarray) cp.testing.assert_allclose(output, expected) @parameterized.expand( @@ -98,8 +98,8 @@ def test_tramsforms_numpy_batch(self, params, input, expected): input = {"image": input[cp.newaxis, ...]} expected = expected[cp.newaxis, ...] 
output = CuCIMd(keys="image", **params)(input)["image"] - self.assertTrue(output.dtype == expected.dtype) - self.assertTrue(isinstance(output, np.ndarray)) + self.assertEqual(output.dtype, expected.dtype) + self.assertIsInstance(output, np.ndarray) cp.testing.assert_allclose(output, expected) @parameterized.expand( @@ -116,8 +116,8 @@ def test_tramsforms_cupy_single(self, params, input, expected): input = {"image": cp.asarray(input)} expected = cp.asarray(expected) output = CuCIMd(keys="image", **params)(input)["image"] - self.assertTrue(output.dtype == expected.dtype) - self.assertTrue(isinstance(output, cp.ndarray)) + self.assertEqual(output.dtype, expected.dtype) + self.assertIsInstance(output, cp.ndarray) cp.testing.assert_allclose(output, expected) @parameterized.expand( @@ -134,8 +134,8 @@ def test_tramsforms_cupy_batch(self, params, input, expected): input = {"image": cp.asarray(input)[cp.newaxis, ...]} expected = cp.asarray(expected)[cp.newaxis, ...] output = CuCIMd(keys="image", **params)(input)["image"] - self.assertTrue(output.dtype == expected.dtype) - self.assertTrue(isinstance(output, cp.ndarray)) + self.assertEqual(output.dtype, expected.dtype) + self.assertIsInstance(output, cp.ndarray) cp.testing.assert_allclose(output, expected) diff --git a/tests/test_cucim_transform.py b/tests/test_cucim_transform.py index 5f16c11589..162e16b52a 100644 --- a/tests/test_cucim_transform.py +++ b/tests/test_cucim_transform.py @@ -79,8 +79,8 @@ class TestCuCIM(unittest.TestCase): ) def test_tramsforms_numpy_single(self, params, input, expected): output = CuCIM(**params)(input) - self.assertTrue(output.dtype == expected.dtype) - self.assertTrue(isinstance(output, np.ndarray)) + self.assertEqual(output.dtype, expected.dtype) + self.assertIsInstance(output, np.ndarray) cp.testing.assert_allclose(output, expected) @parameterized.expand( @@ -97,8 +97,8 @@ def test_tramsforms_numpy_batch(self, params, input, expected): input = input[cp.newaxis, ...] 
expected = expected[cp.newaxis, ...] output = CuCIM(**params)(input) - self.assertTrue(output.dtype == expected.dtype) - self.assertTrue(isinstance(output, np.ndarray)) + self.assertEqual(output.dtype, expected.dtype) + self.assertIsInstance(output, np.ndarray) cp.testing.assert_allclose(output, expected) @parameterized.expand( @@ -115,8 +115,8 @@ def test_tramsforms_cupy_single(self, params, input, expected): input = cp.asarray(input) expected = cp.asarray(expected) output = CuCIM(**params)(input) - self.assertTrue(output.dtype == expected.dtype) - self.assertTrue(isinstance(output, cp.ndarray)) + self.assertEqual(output.dtype, expected.dtype) + self.assertIsInstance(output, cp.ndarray) cp.testing.assert_allclose(output, expected) @parameterized.expand( @@ -133,8 +133,8 @@ def test_tramsforms_cupy_batch(self, params, input, expected): input = cp.asarray(input)[cp.newaxis, ...] expected = cp.asarray(expected)[cp.newaxis, ...] output = CuCIM(**params)(input) - self.assertTrue(output.dtype == expected.dtype) - self.assertTrue(isinstance(output, cp.ndarray)) + self.assertEqual(output.dtype, expected.dtype) + self.assertIsInstance(output, cp.ndarray) cp.testing.assert_allclose(output, expected) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 1398009c63..0d37ae2efd 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -23,7 +23,7 @@ from parameterized import parameterized from monai.data import Dataset -from monai.transforms import Compose, LoadImaged, SimulateDelayd +from monai.transforms import Compose, Lambda, LoadImage, LoadImaged, SimulateDelay, SimulateDelayd from tests.test_compose import TEST_COMPOSE_LAZY_ON_CALL_LOGGING_TEST_CASES, data_from_keys TEST_CASE_1 = [(128, 128, 128)] @@ -99,6 +99,72 @@ def test_dataset_lazy_on_call(self): data[0, 0:2, 0:2] = 1 +class TestTupleDataset(unittest.TestCase): + + @parameterized.expand([TEST_CASE_1]) + def test_shape(self, expected_shape): + test_image = nib.Nifti1Image(np.random.randint(0, 2, 
size=[128, 128, 128]).astype(float), np.eye(4)) + with tempfile.TemporaryDirectory() as tempdir: + nib.save(test_image, os.path.join(tempdir, "test_image1.nii.gz")) + nib.save(test_image, os.path.join(tempdir, "test_label1.nii.gz")) + nib.save(test_image, os.path.join(tempdir, "test_image2.nii.gz")) + nib.save(test_image, os.path.join(tempdir, "test_label2.nii.gz")) + test_data = [ + (os.path.join(tempdir, "test_image1.nii.gz"), os.path.join(tempdir, "test_label1.nii.gz")), + (os.path.join(tempdir, "test_image2.nii.gz"), os.path.join(tempdir, "test_label2.nii.gz")), + ] + + test_transform = Compose([LoadImage(), SimulateDelay(delay_time=1e-5)]) + + # Here test_transform is applied element by element for the tuple. + dataset = Dataset(data=test_data, transform=test_transform) + data1 = dataset[0] + data2 = dataset[1] + + # Output is a list/tuple + self.assertTrue(isinstance(data1, (list, tuple))) + self.assertTrue(isinstance(data2, (list, tuple))) + + # Number of elements are 2 + self.assertEqual(len(data1), 2) + self.assertEqual(len(data2), 2) + + # Output shapes are as expected + self.assertTupleEqual(data1[0].shape, expected_shape) + self.assertTupleEqual(data1[1].shape, expected_shape) + self.assertTupleEqual(data2[0].shape, expected_shape) + self.assertTupleEqual(data2[1].shape, expected_shape) + + # Here test_transform is applied to the tuple as a whole. 
+ test_transform = Compose( + [ + # LoadImage creates a channel-stacked image when applied to a tuple + LoadImage(), + # Get the channel-stacked image and the label + Lambda(func=lambda x: (x[0].permute(2, 1, 0), x[1])), + ], + map_items=False, + ) + + dataset = Dataset(data=test_data, transform=test_transform) + data1 = dataset[0] + data2 = dataset[1] + + # Output is a list/tuple + self.assertTrue(isinstance(data1, (list, tuple))) + self.assertTrue(isinstance(data2, (list, tuple))) + + # Number of elements are 2 + self.assertEqual(len(data1), 2) + self.assertEqual(len(data2), 2) + + # Output shapes are as expected + self.assertTupleEqual(data1[0].shape, expected_shape) + self.assertTupleEqual(data1[1].shape, expected_shape) + self.assertTupleEqual(data2[0].shape, expected_shape) + self.assertTupleEqual(data2[1].shape, expected_shape) + + class TestDatsesetWithLazy(unittest.TestCase): LOGGER_NAME = "a_logger_name" diff --git a/tests/test_decathlondataset.py b/tests/test_decathlondataset.py index d220cd9097..70a2a6c06c 100644 --- a/tests/test_decathlondataset.py +++ b/tests/test_decathlondataset.py @@ -80,7 +80,7 @@ def _test_dataset(dataset): self.assertDictEqual(properties["labels"], {"0": "background", "1": "Anterior", "2": "Posterior"}) shutil.rmtree(os.path.join(testing_dir, "Task04_Hippocampus")) - try: + with self.assertRaisesRegex(RuntimeError, "^Cannot find dataset directory"): DecathlonDataset( root_dir=testing_dir, task="Task04_Hippocampus", @@ -88,9 +88,6 @@ def _test_dataset(dataset): section="validation", download=False, ) - except RuntimeError as e: - print(str(e)) - self.assertTrue(str(e).startswith("Cannot find dataset directory")) if __name__ == "__main__": diff --git a/tests/test_detect_envelope.py b/tests/test_detect_envelope.py index e2efefeb77..f9c2b5ac53 100644 --- a/tests/test_detect_envelope.py +++ b/tests/test_detect_envelope.py @@ -147,7 +147,7 @@ def test_value_error(self, arguments, image, method): elif method == "__call__": 
self.assertRaises(ValueError, DetectEnvelope(**arguments), image) else: - raise ValueError("Expected raising method invalid. Should be __init__ or __call__.") + self.fail("Expected raising method invalid. Should be __init__ or __call__.") @SkipIfModule("torch.fft") diff --git a/tests/test_dice_ce_loss.py b/tests/test_dice_ce_loss.py index 225618ed2c..97c7ae5050 100644 --- a/tests/test_dice_ce_loss.py +++ b/tests/test_dice_ce_loss.py @@ -93,10 +93,20 @@ def test_result(self, input_param, input_data, expected_val): result = diceceloss(**input_data) np.testing.assert_allclose(result.detach().cpu().numpy(), expected_val, atol=1e-4, rtol=1e-4) - # def test_ill_shape(self): - # loss = DiceCELoss() - # with self.assertRaisesRegex(ValueError, ""): - # loss(torch.ones((1, 2, 3)), torch.ones((1, 1, 2, 3))) + def test_ill_shape(self): + loss = DiceCELoss() + with self.assertRaises(AssertionError): + loss.forward(torch.ones((1, 2, 3)), torch.ones((1, 2, 5))) + + def test_ill_shape2(self): + loss = DiceCELoss() + with self.assertRaises(ValueError): + loss.forward(torch.ones((1, 2, 3)), torch.ones((1, 1, 2, 3))) + + def test_ill_shape3(self): + loss = DiceCELoss() + with self.assertRaises(ValueError): + loss.forward(torch.ones((1, 3, 4, 4)), torch.ones((1, 2, 4, 4))) # def test_ill_reduction(self): # with self.assertRaisesRegex(ValueError, ""): diff --git a/tests/test_dice_focal_loss.py b/tests/test_dice_focal_loss.py index 13899da003..f769aac69f 100644 --- a/tests/test_dice_focal_loss.py +++ b/tests/test_dice_focal_loss.py @@ -69,8 +69,18 @@ def test_result_no_onehot_no_bg(self, size, onehot): def test_ill_shape(self): loss = DiceFocalLoss() - with self.assertRaisesRegex(ValueError, ""): - loss(torch.ones((1, 2, 3)), torch.ones((1, 1, 2, 3))) + with self.assertRaises(AssertionError): + loss.forward(torch.ones((1, 2, 3)), torch.ones((1, 2, 5))) + + def test_ill_shape2(self): + loss = DiceFocalLoss() + with self.assertRaises(ValueError): + loss.forward(torch.ones((1, 2, 3)), 
torch.ones((1, 1, 2, 3))) + + def test_ill_shape3(self): + loss = DiceFocalLoss() + with self.assertRaises(ValueError): + loss.forward(torch.ones((1, 3, 4, 4)), torch.ones((1, 2, 4, 4))) def test_ill_lambda(self): with self.assertRaisesRegex(ValueError, ""): @@ -81,6 +91,35 @@ def test_script(self): test_input = torch.ones(2, 1, 8, 8) test_script_save(loss, test_input, test_input) + @parameterized.expand( + [ + ("sum_None_0.5_0.25", "sum", None, 0.5, 0.25), + ("sum_weight_0.5_0.25", "sum", torch.tensor([1.0, 1.0, 2.0]), 0.5, 0.25), + ("sum_weight_tuple_0.5_0.25", "sum", (3, 2.0, 1), 0.5, 0.25), + ("mean_None_0.5_0.25", "mean", None, 0.5, 0.25), + ("mean_weight_0.5_0.25", "mean", torch.tensor([1.0, 1.0, 2.0]), 0.5, 0.25), + ("mean_weight_tuple_0.5_0.25", "mean", (3, 2.0, 1), 0.5, 0.25), + ("none_None_0.5_0.25", "none", None, 0.5, 0.25), + ("none_weight_0.5_0.25", "none", torch.tensor([1.0, 1.0, 2.0]), 0.5, 0.25), + ("none_weight_tuple_0.5_0.25", "none", (3, 2.0, 1), 0.5, 0.25), + ] + ) + def test_with_alpha(self, name, reduction, weight, lambda_focal, alpha): + size = [3, 3, 5, 5] + label = torch.randint(low=0, high=2, size=size) + pred = torch.randn(size) + + common_params = {"include_background": True, "to_onehot_y": False, "reduction": reduction, "weight": weight} + + dice_focal = DiceFocalLoss(gamma=1.0, lambda_focal=lambda_focal, alpha=alpha, **common_params) + dice = DiceLoss(**common_params) + focal = FocalLoss(gamma=1.0, alpha=alpha, **common_params) + + result = dice_focal(pred, label) + expected_val = dice(pred, label) + lambda_focal * focal(pred, label) + + np.testing.assert_allclose(result, expected_val, err_msg=f"Failed on case: {name}") + if __name__ == "__main__": unittest.main() diff --git a/tests/test_diffusion_inferer.py b/tests/test_diffusion_inferer.py new file mode 100644 index 0000000000..7f37025d3c --- /dev/null +++ b/tests/test_diffusion_inferer.py @@ -0,0 +1,236 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.inferers import DiffusionInferer +from monai.networks.nets import DiffusionModelUNet +from monai.networks.schedulers import DDIMScheduler, DDPMScheduler +from monai.utils import optional_import + +_, has_scipy = optional_import("scipy") +_, has_einops = optional_import("einops") + +TEST_CASES = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": [8], + "norm_num_groups": 8, + "attention_levels": [True], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + (2, 1, 8, 8), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": [8], + "norm_num_groups": 8, + "attention_levels": [True], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + (2, 1, 8, 8, 8), + ], +] + + +class TestDiffusionSamplingInferer(unittest.TestCase): + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_call(self, model_params, input_shape): + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + input = torch.randn(input_shape).to(device) + noise = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = DiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + timesteps = 
torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + sample = inferer(inputs=input, noise=noise, diffusion_model=model, timesteps=timesteps) + self.assertEqual(sample.shape, input_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sample_intermediates(self, model_params, input_shape): + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + noise = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = DiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + sample, intermediates = inferer.sample( + input_noise=noise, diffusion_model=model, scheduler=scheduler, save_intermediates=True, intermediate_steps=1 + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_ddpm_sampler(self, model_params, input_shape): + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + noise = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=1000) + inferer = DiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + sample, intermediates = inferer.sample( + input_noise=noise, diffusion_model=model, scheduler=scheduler, save_intermediates=True, intermediate_steps=1 + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_ddim_sampler(self, model_params, input_shape): + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + noise = torch.randn(input_shape).to(device) + scheduler = DDIMScheduler(num_train_timesteps=1000) + inferer = 
DiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + sample, intermediates = inferer.sample( + input_noise=noise, diffusion_model=model, scheduler=scheduler, save_intermediates=True, intermediate_steps=1 + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sampler_conditioned(self, model_params, input_shape): + model_params["with_conditioning"] = True + model_params["cross_attention_dim"] = 3 + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + noise = torch.randn(input_shape).to(device) + scheduler = DDIMScheduler(num_train_timesteps=1000) + inferer = DiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + conditioning = torch.randn([input_shape[0], 1, 3]).to(device) + sample, intermediates = inferer.sample( + input_noise=noise, + diffusion_model=model, + scheduler=scheduler, + save_intermediates=True, + intermediate_steps=1, + conditioning=conditioning, + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_get_likelihood(self, model_params, input_shape): + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + input = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = DiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + likelihood, intermediates = inferer.get_likelihood( + inputs=input, diffusion_model=model, scheduler=scheduler, save_intermediates=True + ) + self.assertEqual(intermediates[0].shape, input.shape) + self.assertEqual(likelihood.shape[0], input.shape[0]) + + @unittest.skipUnless(has_scipy, "Requires scipy library.") + def test_normal_cdf(self): + from 
scipy.stats import norm + + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = DiffusionInferer(scheduler=scheduler) + + x = torch.linspace(-10, 10, 20) + cdf_approx = inferer._approx_standard_normal_cdf(x) + cdf_true = norm.cdf(x) + torch.testing.assert_allclose(cdf_approx, cdf_true, atol=1e-3, rtol=1e-5) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sampler_conditioned_concat(self, model_params, input_shape): + # copy the model_params dict to prevent from modifying test cases + model_params = model_params.copy() + n_concat_channel = 2 + model_params["in_channels"] = model_params["in_channels"] + n_concat_channel + model_params["cross_attention_dim"] = None + model_params["with_conditioning"] = False + model = DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + noise = torch.randn(input_shape).to(device) + conditioning_shape = list(input_shape) + conditioning_shape[1] = n_concat_channel + conditioning = torch.randn(conditioning_shape).to(device) + scheduler = DDIMScheduler(num_train_timesteps=1000) + inferer = DiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + sample, intermediates = inferer.sample( + input_noise=noise, + diffusion_model=model, + scheduler=scheduler, + save_intermediates=True, + intermediate_steps=1, + conditioning=conditioning, + mode="concat", + ) + self.assertEqual(len(intermediates), 10) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_call_conditioned_concat(self, model_params, input_shape): + # copy the model_params dict to prevent from modifying test cases + model_params = model_params.copy() + n_concat_channel = 2 + model_params["in_channels"] = model_params["in_channels"] + n_concat_channel + model_params["cross_attention_dim"] = None + model_params["with_conditioning"] = False + model = 
DiffusionModelUNet(**model_params) + device = "cuda:0" if torch.cuda.is_available() else "cpu" + model.to(device) + model.eval() + input = torch.randn(input_shape).to(device) + noise = torch.randn(input_shape).to(device) + conditioning_shape = list(input_shape) + conditioning_shape[1] = n_concat_channel + conditioning = torch.randn(conditioning_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = DiffusionInferer(scheduler=scheduler) + scheduler.set_timesteps(num_inference_steps=10) + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + sample = inferer( + inputs=input, noise=noise, diffusion_model=model, timesteps=timesteps, condition=conditioning, mode="concat" + ) + self.assertEqual(sample.shape, input_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_diffusion_model_unet.py b/tests/test_diffusion_model_unet.py new file mode 100644 index 0000000000..7f764d85de --- /dev/null +++ b/tests/test_diffusion_model_unet.py @@ -0,0 +1,585 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.apps import download_url +from monai.networks import eval_mode +from monai.networks.nets import DiffusionModelUNet +from monai.utils import optional_import +from tests.utils import skip_if_downloading_fails, testing_data_config + +_, has_einops = optional_import("einops") + +UNCOND_CASES_2D = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": (1, 1, 2), + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, True, True), + "num_head_channels": (0, 2, 4), + 
"norm_num_groups": 8, + } + ], +] + +UNCOND_CASES_3D = [ + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": (0, 0, 4), + "norm_num_groups": 8, + } + ], +] + +COND_CASES_2D = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, 
+ "cross_attention_dim": 3, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "upcast_attention": True, + } + ], +] + +DROPOUT_OK = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "dropout_cattn": 0.25, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + } + ], +] + +DROPOUT_WRONG = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "dropout_cattn": 3.0, + } + ] +] + + +class TestDiffusionModelUNet2D(unittest.TestCase): + @parameterized.expand(UNCOND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def test_shape_unconditioned_models(self, input_param): + net = DiffusionModelUNet(**input_param) + with eval_mode(net): + result = net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, 1, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_timestep_with_wrong_shape(self): + net = DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + 
num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with self.assertRaises(ValueError): + with eval_mode(net): + net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1, 1)).long()) + + @skipUnless(has_einops, "Requires einops") + def test_shape_with_different_in_channel_out_channel(self): + in_channels = 6 + out_channels = 3 + net = DiffusionModelUNet( + spatial_dims=2, + in_channels=in_channels, + out_channels=out_channels, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with eval_mode(net): + result = net.forward(torch.rand((1, in_channels, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, out_channels, 16, 16)) + + def test_model_channels_not_multiple_of_norm_num_group(self): + with self.assertRaises(ValueError): + DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 12), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + + def test_attention_levels_with_different_length_num_head_channels(self): + with self.assertRaises(ValueError): + DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, False), + num_head_channels=(0, 2), + norm_num_groups=8, + ) + + def test_num_res_blocks_with_different_length_channels(self): + with self.assertRaises(ValueError): + DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=(1, 1), + channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models(self): + net = DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=True, + 
transformer_num_layers=1, + cross_attention_dim=3, + norm_num_groups=8, + num_head_channels=8, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + context=torch.rand((1, 1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 32)) + + def test_with_conditioning_cross_attention_dim_none(self): + with self.assertRaises(ValueError): + DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=None, + norm_num_groups=8, + ) + + @skipUnless(has_einops, "Requires einops") + def test_context_with_conditioning_none(self): + net = DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=False, + transformer_num_layers=1, + norm_num_groups=8, + ) + + with self.assertRaises(ValueError): + with eval_mode(net): + net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + context=torch.rand((1, 1, 3)), + ) + + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models_class_conditioning(self): + net = DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + class_labels=torch.randint(0, 2, (1,)).long(), + ) + self.assertEqual(result.shape, (1, 1, 16, 32)) + + @skipUnless(has_einops, "Requires einops") + def test_conditioned_models_no_class_labels(self): + net = DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + 
channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + + with self.assertRaises(ValueError): + net.forward(x=torch.rand((1, 1, 16, 32)), timesteps=torch.randint(0, 1000, (1,)).long()) + + @skipUnless(has_einops, "Requires einops") + def test_model_channels_not_same_size_of_attention_levels(self): + with self.assertRaises(ValueError): + DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + + @parameterized.expand(COND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def test_conditioned_2d_models_shape(self, input_param): + net = DiffusionModelUNet(**input_param) + with eval_mode(net): + result = net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1,)).long(), torch.rand((1, 1, 3))) + self.assertEqual(result.shape, (1, 1, 16, 16)) + + +class TestDiffusionModelUNet3D(unittest.TestCase): + @parameterized.expand(UNCOND_CASES_3D) + @skipUnless(has_einops, "Requires einops") + def test_shape_unconditioned_models(self, input_param): + net = DiffusionModelUNet(**input_param) + with eval_mode(net): + result = net.forward(torch.rand((1, 1, 16, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, 1, 16, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_shape_with_different_in_channel_out_channel(self): + in_channels = 6 + out_channels = 3 + net = DiffusionModelUNet( + spatial_dims=3, + in_channels=in_channels, + out_channels=out_channels, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=4, + ) + with eval_mode(net): + result = net.forward(torch.rand((1, in_channels, 16, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, out_channels, 16, 16, 16)) + + @skipUnless(has_einops, "Requires 
einops") + def test_shape_conditioned_models(self): + net = DiffusionModelUNet( + spatial_dims=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(16, 16, 16), + attention_levels=(False, False, True), + norm_num_groups=16, + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=3, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 16, 16)), + timesteps=torch.randint(0, 1000, (1,)).long(), + context=torch.rand((1, 1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 16, 16)) + + # Test dropout specification for cross-attention blocks + @parameterized.expand(DROPOUT_WRONG) + def test_wrong_dropout(self, input_param): + with self.assertRaises(ValueError): + _ = DiffusionModelUNet(**input_param) + + @parameterized.expand(DROPOUT_OK) + @skipUnless(has_einops, "Requires einops") + def test_right_dropout(self, input_param): + _ = DiffusionModelUNet(**input_param) + + @skipUnless(has_einops, "Requires einops") + def test_compatibility_with_monai_generative(self): + # test loading weights from a model saved in MONAI Generative, version 0.2.3 + with skip_if_downloading_fails(): + net = DiffusionModelUNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=True, + cross_attention_dim=3, + transformer_num_layers=1, + norm_num_groups=8, + ) + + tmpdir = tempfile.mkdtemp() + key = "diffusion_model_unet_monai_generative_weights" + url = testing_data_config("models", key, "url") + hash_type = testing_data_config("models", key, "hash_type") + hash_val = testing_data_config("models", key, "hash_val") + filename = "diffusion_model_unet_monai_generative_weights.pt" + + weight_path = os.path.join(tmpdir, filename) + download_url(url=url, filepath=weight_path, hash_val=hash_val, hash_type=hash_type) + + net.load_old_state_dict(torch.load(weight_path), verbose=False) + + +if __name__ == "__main__": + unittest.main() 
diff --git a/tests/test_diffusion_model_unet_maisi.py b/tests/test_diffusion_model_unet_maisi.py new file mode 100644 index 0000000000..f9384e6d82 --- /dev/null +++ b/tests/test_diffusion_model_unet_maisi.py @@ -0,0 +1,588 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.apps.generation.maisi.networks.diffusion_model_unet_maisi import DiffusionModelUNetMaisi +from monai.networks import eval_mode +from monai.utils import optional_import + +_, has_einops = optional_import("einops") + +UNCOND_CASES_2D = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": (1, 1, 2), + "num_channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, 
True), + "num_head_channels": 8, + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, True, True), + "num_head_channels": (0, 2, 4), + "norm_num_groups": 8, + } + ], +] + +UNCOND_CASES_3D = [ + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, 
+ "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": (0, 0, 4), + "norm_num_groups": 8, + } + ], +] + +COND_CASES_2D = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "upcast_attention": True, + } + ], +] + +DROPOUT_OK = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "dropout_cattn": 0.25, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + } + ], +] + +DROPOUT_WRONG = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), 
+ "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "dropout_cattn": 3.0, + } + ] +] + + +class TestDiffusionModelUNetMaisi2D(unittest.TestCase): + + @parameterized.expand(UNCOND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def test_shape_unconditioned_models(self, input_param): + net = DiffusionModelUNetMaisi(**input_param) + with eval_mode(net): + result = net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, 1, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_timestep_with_wrong_shape(self): + net = DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with self.assertRaises(ValueError): + with eval_mode(net): + net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1, 1)).long()) + + @skipUnless(has_einops, "Requires einops") + def test_shape_with_different_in_channel_out_channel(self): + in_channels = 6 + out_channels = 3 + net = DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=in_channels, + out_channels=out_channels, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with eval_mode(net): + result = net.forward(torch.rand((1, in_channels, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, out_channels, 16, 16)) + + def test_model_channels_not_multiple_of_norm_num_group(self): + with self.assertRaises(ValueError): + DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 12), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + + def test_attention_levels_with_different_length_num_head_channels(self): + with 
self.assertRaises(ValueError): + DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, False), + num_head_channels=(0, 2), + norm_num_groups=8, + ) + + def test_num_res_blocks_with_different_length_channels(self): + with self.assertRaises(ValueError): + DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=(1, 1), + num_channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models(self): + net = DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=3, + norm_num_groups=8, + num_head_channels=8, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + context=torch.rand((1, 1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 32)) + + def test_with_conditioning_cross_attention_dim_none(self): + with self.assertRaises(ValueError): + DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=None, + norm_num_groups=8, + ) + + @skipUnless(has_einops, "Requires einops") + def test_context_with_conditioning_none(self): + net = DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=False, + transformer_num_layers=1, + norm_num_groups=8, + ) + + with self.assertRaises(ValueError): + with eval_mode(net): + net.forward( + x=torch.rand((1, 1, 16, 32)), + 
timesteps=torch.randint(0, 1000, (1,)).long(), + context=torch.rand((1, 1, 3)), + ) + + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models_class_conditioning(self): + net = DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + class_labels=torch.randint(0, 2, (1,)).long(), + ) + self.assertEqual(result.shape, (1, 1, 16, 32)) + + @skipUnless(has_einops, "Requires einops") + def test_conditioned_models_no_class_labels(self): + net = DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + + with self.assertRaises(ValueError): + net.forward(x=torch.rand((1, 1, 16, 32)), timesteps=torch.randint(0, 1000, (1,)).long()) + + @skipUnless(has_einops, "Requires einops") + def test_model_channels_not_same_size_of_attention_levels(self): + with self.assertRaises(ValueError): + DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + + @parameterized.expand(COND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def test_conditioned_2d_models_shape(self, input_param): + net = DiffusionModelUNetMaisi(**input_param) + with eval_mode(net): + result = net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1,)).long(), torch.rand((1, 1, 3))) + self.assertEqual(result.shape, (1, 1, 16, 16)) + + @parameterized.expand(UNCOND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def 
test_shape_with_additional_inputs(self, input_param): + input_param["include_top_region_index_input"] = True + input_param["include_bottom_region_index_input"] = True + input_param["include_spacing_input"] = True + net = DiffusionModelUNetMaisi(**input_param) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 16)), + timesteps=torch.randint(0, 1000, (1,)).long(), + top_region_index_tensor=torch.rand((1, 4)), + bottom_region_index_tensor=torch.rand((1, 4)), + spacing_tensor=torch.rand((1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 16)) + + +class TestDiffusionModelUNetMaisi3D(unittest.TestCase): + + @parameterized.expand(UNCOND_CASES_3D) + @skipUnless(has_einops, "Requires einops") + def test_shape_unconditioned_models(self, input_param): + net = DiffusionModelUNetMaisi(**input_param) + with eval_mode(net): + result = net.forward(torch.rand((1, 1, 16, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, 1, 16, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_shape_with_different_in_channel_out_channel(self): + in_channels = 6 + out_channels = 3 + net = DiffusionModelUNetMaisi( + spatial_dims=3, + in_channels=in_channels, + out_channels=out_channels, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=4, + ) + with eval_mode(net): + result = net.forward(torch.rand((1, in_channels, 16, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, out_channels, 16, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models(self): + net = DiffusionModelUNetMaisi( + spatial_dims=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(16, 16, 16), + attention_levels=(False, False, True), + norm_num_groups=16, + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=3, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 16, 
16)), + timesteps=torch.randint(0, 1000, (1,)).long(), + context=torch.rand((1, 1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 16, 16)) + + # Test dropout specification for cross-attention blocks + @parameterized.expand(DROPOUT_WRONG) + def test_wrong_dropout(self, input_param): + with self.assertRaises(ValueError): + _ = DiffusionModelUNetMaisi(**input_param) + + @parameterized.expand(DROPOUT_OK) + @skipUnless(has_einops, "Requires einops") + def test_right_dropout(self, input_param): + _ = DiffusionModelUNetMaisi(**input_param) + + @parameterized.expand(UNCOND_CASES_3D) + @skipUnless(has_einops, "Requires einops") + def test_shape_with_additional_inputs(self, input_param): + input_param["include_top_region_index_input"] = True + input_param["include_bottom_region_index_input"] = True + input_param["include_spacing_input"] = True + net = DiffusionModelUNetMaisi(**input_param) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 16, 16)), + timesteps=torch.randint(0, 1000, (1,)).long(), + top_region_index_tensor=torch.rand((1, 4)), + bottom_region_index_tensor=torch.rand((1, 4)), + spacing_tensor=torch.rand((1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 16, 16)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dynunet.py b/tests/test_dynunet.py index b0137ae245..f3c982056c 100644 --- a/tests/test_dynunet.py +++ b/tests/test_dynunet.py @@ -11,6 +11,7 @@ from __future__ import annotations +import platform import unittest from typing import Any, Sequence @@ -24,6 +25,12 @@ InstanceNorm3dNVFuser, _ = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") +ON_AARCH64 = platform.machine() == "aarch64" +if ON_AARCH64: + rtol, atol = 1e-2, 1e-2 +else: + rtol, atol = 1e-4, 1e-4 + device = "cuda" if torch.cuda.is_available() else "cpu" strides: Sequence[Sequence[int] | int] @@ -159,7 +166,7 @@ def test_consistency(self, input_param, input_shape, _): with eval_mode(net_fuser): result_fuser = 
net_fuser(input_tensor) - assert_allclose(result, result_fuser, rtol=1e-4, atol=1e-4) + assert_allclose(result, result_fuser, rtol=rtol, atol=atol) class TestDynUNetDeepSupervision(unittest.TestCase): diff --git a/tests/test_ensure_channel_first.py b/tests/test_ensure_channel_first.py index 0c9ad5869e..fe046a4cdf 100644 --- a/tests/test_ensure_channel_first.py +++ b/tests/test_ensure_channel_first.py @@ -50,9 +50,10 @@ class TestEnsureChannelFirst(unittest.TestCase): @parameterized.expand([TEST_CASE_1, TEST_CASE_2, TEST_CASE_3, TEST_CASE_4, TEST_CASE_5, TEST_CASE_6]) @unittest.skipUnless(has_itk, "itk not installed") def test_load_nifti(self, input_param, filenames, original_channel_dim): - if original_channel_dim is None: - test_image = np.random.rand(8, 8, 8) - elif original_channel_dim == -1: + # if original_channel_dim is None + test_image = np.random.rand(8, 8, 8) + + if original_channel_dim == -1: test_image = np.random.rand(8, 8, 8, 1) with tempfile.TemporaryDirectory() as tempdir: diff --git a/tests/test_ensure_channel_firstd.py b/tests/test_ensure_channel_firstd.py index 63a437894b..e9effad951 100644 --- a/tests/test_ensure_channel_firstd.py +++ b/tests/test_ensure_channel_firstd.py @@ -35,9 +35,10 @@ class TestEnsureChannelFirstd(unittest.TestCase): @parameterized.expand([TEST_CASE_1, TEST_CASE_2, TEST_CASE_3]) def test_load_nifti(self, input_param, filenames, original_channel_dim): - if original_channel_dim is None: - test_image = np.random.rand(8, 8, 8) - elif original_channel_dim == -1: + # if original_channel_dim is None: + test_image = np.random.rand(8, 8, 8) + + if original_channel_dim == -1: test_image = np.random.rand(8, 8, 8, 1) with tempfile.TemporaryDirectory() as tempdir: diff --git a/tests/test_ensure_typed.py b/tests/test_ensure_typed.py index 09aa1f04b5..fe543347de 100644 --- a/tests/test_ensure_typed.py +++ b/tests/test_ensure_typed.py @@ -33,8 +33,8 @@ def test_array_input(self): keys="data", data_type=dtype, dtype=np.float32 if dtype == 
"NUMPY" else None, device="cpu" )({"data": test_data})["data"] if dtype == "NUMPY": - self.assertTrue(result.dtype == np.float32) - self.assertTrue(isinstance(result, torch.Tensor if dtype == "tensor" else np.ndarray)) + self.assertEqual(result.dtype, np.float32) + self.assertIsInstance(result, torch.Tensor if dtype == "tensor" else np.ndarray) assert_allclose(result, test_data, type_test=False) self.assertTupleEqual(result.shape, (2, 2)) @@ -45,7 +45,7 @@ def test_single_input(self): for test_data in test_datas: for dtype in ("tensor", "numpy"): result = EnsureTyped(keys="data", data_type=dtype)({"data": test_data})["data"] - self.assertTrue(isinstance(result, torch.Tensor if dtype == "tensor" else np.ndarray)) + self.assertIsInstance(result, torch.Tensor if dtype == "tensor" else np.ndarray) if isinstance(test_data, bool): self.assertFalse(result) else: @@ -56,11 +56,11 @@ def test_string(self): for dtype in ("tensor", "numpy"): # string input result = EnsureTyped(keys="data", data_type=dtype)({"data": "test_string"})["data"] - self.assertTrue(isinstance(result, str)) + self.assertIsInstance(result, str) self.assertEqual(result, "test_string") # numpy array of string result = EnsureTyped(keys="data", data_type=dtype)({"data": np.array(["test_string"])})["data"] - self.assertTrue(isinstance(result, np.ndarray)) + self.assertIsInstance(result, np.ndarray) self.assertEqual(result[0], "test_string") def test_list_tuple(self): @@ -68,15 +68,15 @@ def test_list_tuple(self): result = EnsureTyped(keys="data", data_type=dtype, wrap_sequence=False, track_meta=True)( {"data": [[1, 2], [3, 4]]} )["data"] - self.assertTrue(isinstance(result, list)) - self.assertTrue(isinstance(result[0][1], MetaTensor if dtype == "tensor" else np.ndarray)) + self.assertIsInstance(result, list) + self.assertIsInstance(result[0][1], MetaTensor if dtype == "tensor" else np.ndarray) assert_allclose(result[1][0], torch.as_tensor(3), type_test=False) # tuple of numpy arrays result = 
EnsureTyped(keys="data", data_type=dtype, wrap_sequence=False)( {"data": (np.array([1, 2]), np.array([3, 4]))} )["data"] - self.assertTrue(isinstance(result, tuple)) - self.assertTrue(isinstance(result[0], torch.Tensor if dtype == "tensor" else np.ndarray)) + self.assertIsInstance(result, tuple) + self.assertIsInstance(result[0], torch.Tensor if dtype == "tensor" else np.ndarray) assert_allclose(result[1], torch.as_tensor([3, 4]), type_test=False) def test_dict(self): @@ -92,19 +92,19 @@ def test_dict(self): ) for key in ("data", "label"): result = trans[key] - self.assertTrue(isinstance(result, dict)) - self.assertTrue(isinstance(result["img"], torch.Tensor if dtype == "tensor" else np.ndarray)) - self.assertTrue(isinstance(result["meta"]["size"], torch.Tensor if dtype == "tensor" else np.ndarray)) + self.assertIsInstance(result, dict) + self.assertIsInstance(result["img"], torch.Tensor if dtype == "tensor" else np.ndarray) + self.assertIsInstance(result["meta"]["size"], torch.Tensor if dtype == "tensor" else np.ndarray) self.assertEqual(result["meta"]["path"], "temp/test") self.assertEqual(result["extra"], None) assert_allclose(result["img"], torch.as_tensor([1.0, 2.0]), type_test=False) assert_allclose(result["meta"]["size"], torch.as_tensor([1, 2, 3]), type_test=False) if dtype == "numpy": - self.assertTrue(trans["data"]["img"].dtype == np.float32) - self.assertTrue(trans["label"]["img"].dtype == np.int8) + self.assertEqual(trans["data"]["img"].dtype, np.float32) + self.assertEqual(trans["label"]["img"].dtype, np.int8) else: - self.assertTrue(trans["data"]["img"].dtype == torch.float32) - self.assertTrue(trans["label"]["img"].dtype == torch.int8) + self.assertEqual(trans["data"]["img"].dtype, torch.float32) + self.assertEqual(trans["label"]["img"].dtype, torch.int8) if __name__ == "__main__": diff --git a/tests/test_evenly_divisible_all_gather_dist.py b/tests/test_evenly_divisible_all_gather_dist.py index d6d26c7e23..f1d45ba48f 100644 --- 
a/tests/test_evenly_divisible_all_gather_dist.py +++ b/tests/test_evenly_divisible_all_gather_dist.py @@ -27,10 +27,10 @@ def test_data(self): self._run() def _run(self): - if dist.get_rank() == 0: - data1 = torch.tensor([[1, 2], [3, 4]]) - data2 = torch.tensor([[1.0, 2.0]]) - data3 = torch.tensor(7) + # if dist.get_rank() == 0 + data1 = torch.tensor([[1, 2], [3, 4]]) + data2 = torch.tensor([[1.0, 2.0]]) + data3 = torch.tensor(7) if dist.get_rank() == 1: data1 = torch.tensor([[5, 6]]) diff --git a/tests/test_fl_monai_algo.py b/tests/test_fl_monai_algo.py index 54bec24b98..c8cb3451fc 100644 --- a/tests/test_fl_monai_algo.py +++ b/tests/test_fl_monai_algo.py @@ -75,7 +75,7 @@ tracking={ "handlers_id": DEFAULT_HANDLERS_ID, "configs": { - "execute_config": f"{_data_dir}/config_executed.json", + "save_execute_config": f"{_data_dir}/config_executed.json", "trainer": { "_target_": "MLFlowHandler", "tracking_uri": path_to_uri(_data_dir) + "/mlflow_override", @@ -201,7 +201,7 @@ def test_train(self, input_params): algo.finalize() # test experiment management - if "execute_config" in algo.train_workflow.parser: + if "save_execute_config" in algo.train_workflow.parser: self.assertTrue(os.path.exists(f"{_data_dir}/mlflow_override")) shutil.rmtree(f"{_data_dir}/mlflow_override") self.assertTrue(os.path.exists(f"{_data_dir}/config_executed.json")) @@ -224,7 +224,7 @@ def test_evaluate(self, input_params): algo.evaluate(data=data, extra={}) # test experiment management - if "execute_config" in algo.eval_workflow.parser: + if "save_execute_config" in algo.eval_workflow.parser: self.assertGreater(len(list(glob.glob(f"{_data_dir}/mlflow_*"))), 0) for f in list(glob.glob(f"{_data_dir}/mlflow_*")): shutil.rmtree(f) diff --git a/tests/test_flexible_unet.py b/tests/test_flexible_unet.py index 404855c9a8..42baa28b71 100644 --- a/tests/test_flexible_unet.py +++ b/tests/test_flexible_unet.py @@ -23,12 +23,11 @@ EfficientNetBNFeatures, FlexibleUNet, FlexUNetEncoderRegister, - ResNet, - 
ResNetBlock, - ResNetBottleneck, + ResNetEncoder, + ResNetFeatures, ) from monai.utils import optional_import -from tests.utils import skip_if_downloading_fails, skip_if_quick +from tests.utils import SkipIfNoModule, skip_if_downloading_fails, skip_if_quick torchvision, has_torchvision = optional_import("torchvision") PIL, has_pil = optional_import("PIL") @@ -59,101 +58,6 @@ def get_encoder_names(cls): return ["encoder_wrong_channels", "encoder_no_param1", "encoder_no_param2", "encoder_no_param3"] -class ResNetEncoder(ResNet, BaseEncoder): - backbone_names = ["resnet10", "resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "resnet200"] - output_feature_channels = [(64, 128, 256, 512)] * 3 + [(256, 512, 1024, 2048)] * 4 - parameter_layers = [ - [1, 1, 1, 1], - [2, 2, 2, 2], - [3, 4, 6, 3], - [3, 4, 6, 3], - [3, 4, 23, 3], - [3, 8, 36, 3], - [3, 24, 36, 3], - ] - - def __init__(self, in_channels, pretrained, **kargs): - super().__init__(**kargs, n_input_channels=in_channels) - if pretrained: - # Author of paper zipped the state_dict on googledrive, - # so would need to download, unzip and read (2.8gb file for a ~150mb state dict). - # Would like to load dict from url but need somewhere to save the state dicts. - raise NotImplementedError( - "Currently not implemented. You need to manually download weights provided by the paper's author" - " and load then to the model with `state_dict`. See https://github.com/Tencent/MedicalNet" - ) - - @staticmethod - def get_inplanes(): - return [64, 128, 256, 512] - - @classmethod - def get_encoder_parameters(cls) -> list[dict]: - """ - Get parameter list to initialize encoder networks. - Each parameter dict must have `spatial_dims`, `in_channels` - and `pretrained` parameters. 
- """ - parameter_list = [] - res_type: type[ResNetBlock] | type[ResNetBottleneck] - for backbone in range(len(cls.backbone_names)): - if backbone < 3: - res_type = ResNetBlock - else: - res_type = ResNetBottleneck - parameter_list.append( - { - "block": res_type, - "layers": cls.parameter_layers[backbone], - "block_inplanes": ResNetEncoder.get_inplanes(), - "spatial_dims": 2, - "in_channels": 3, - "pretrained": False, - } - ) - return parameter_list - - @classmethod - def num_channels_per_output(cls): - """ - Get number of output features' channel. - """ - return cls.output_feature_channels - - @classmethod - def num_outputs(cls): - """ - Get number of output feature. - """ - return [4] * 7 - - @classmethod - def get_encoder_names(cls): - """ - Get the name string of backbones which will be used to initialize flexible unet. - """ - return cls.backbone_names - - def forward(self, x: torch.Tensor): - feature_list = [] - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - if not self.no_max_pool: - x = self.maxpool(x) - x = self.layer1(x) - feature_list.append(x) - x = self.layer2(x) - feature_list.append(x) - x = self.layer3(x) - feature_list.append(x) - x = self.layer4(x) - feature_list.append(x) - - return feature_list - - -FLEXUNET_BACKBONE.register_class(ResNetEncoder) FLEXUNET_BACKBONE.register_class(DummyEncoder) @@ -204,9 +108,7 @@ def make_shape_cases( def make_error_case(): - error_dummy_backbones = DummyEncoder.get_encoder_names() - error_resnet_backbones = ResNetEncoder.get_encoder_names() - error_backbones = error_dummy_backbones + error_resnet_backbones + error_backbones = DummyEncoder.get_encoder_names() error_param_list = [] for backbone in error_backbones: error_param_list.append( @@ -232,7 +134,7 @@ def make_error_case(): norm="instance", ) CASES_3D = make_shape_cases( - models=[SEL_MODELS[0]], + models=[SEL_MODELS[0], SEL_MODELS[2]], spatial_dims=[3], batches=[1], pretrained=[False], @@ -345,6 +247,7 @@ def make_error_case(): "spatial_dims": 
2, "norm": ("batch", {"eps": 1e-3, "momentum": 0.01}), }, + EfficientNetBNFeatures, { "in_channels": 3, "num_classes": 10, @@ -354,7 +257,20 @@ def make_error_case(): "norm": ("batch", {"eps": 1e-3, "momentum": 0.01}), }, ["_conv_stem.weight"], - ) + ), + ( + { + "in_channels": 1, + "out_channels": 10, + "backbone": SEL_MODELS[2], + "pretrained": True, + "spatial_dims": 3, + "norm": ("batch", {"eps": 1e-3, "momentum": 0.01}), + }, + ResNetFeatures, + {"model_name": SEL_MODELS[2], "pretrained": True, "spatial_dims": 3, "in_channels": 1}, + ["conv1.weight"], + ), ] CASE_ERRORS = make_error_case() @@ -363,6 +279,7 @@ def make_error_case(): CASE_REGISTER_ENCODER = ["EfficientNetEncoder", "monai.networks.nets.EfficientNetEncoder"] +@SkipIfNoModule("hf_hub_download") @skip_if_quick class TestFLEXIBLEUNET(unittest.TestCase): @@ -381,19 +298,19 @@ def test_shape(self, input_param, input_shape, expected_shape): self.assertEqual(result.shape, expected_shape) @parameterized.expand(CASES_PRETRAIN) - def test_pretrain(self, input_param, efficient_input_param, weight_list): + def test_pretrain(self, flexunet_input_param, feature_extractor_class, feature_extractor_input_param, weight_list): device = "cuda" if torch.cuda.is_available() else "cpu" with skip_if_downloading_fails(): - net = FlexibleUNet(**input_param).to(device) + net = FlexibleUNet(**flexunet_input_param).to(device) with skip_if_downloading_fails(): - eff_net = EfficientNetBNFeatures(**efficient_input_param).to(device) + feature_extractor_net = feature_extractor_class(**feature_extractor_input_param).to(device) for weight_name in weight_list: - if weight_name in net.encoder.state_dict() and weight_name in eff_net.state_dict(): + if weight_name in net.encoder.state_dict() and weight_name in feature_extractor_net.state_dict(): net_weight = net.encoder.state_dict()[weight_name] - download_weight = eff_net.state_dict()[weight_name] + download_weight = feature_extractor_net.state_dict()[weight_name] weight_diff = 
torch.abs(net_weight - download_weight) diff_sum = torch.sum(weight_diff) # check if a weight in weight_list equals to the downloaded weight. diff --git a/tests/test_flipd.py b/tests/test_flipd.py index 277f387051..1df6d34056 100644 --- a/tests/test_flipd.py +++ b/tests/test_flipd.py @@ -78,7 +78,7 @@ def test_torch(self, spatial_axis, img: torch.Tensor, track_meta: bool, device): def test_meta_dict(self): xform = Flipd("image", [0, 1]) res = xform({"image": torch.zeros(1, 3, 4)}) - self.assertTrue(res["image"].applied_operations == res["image_transforms"]) + self.assertEqual(res["image"].applied_operations, res["image_transforms"]) if __name__ == "__main__": diff --git a/tests/test_focal_loss.py b/tests/test_focal_loss.py index de8d625058..0bb8a078ae 100644 --- a/tests/test_focal_loss.py +++ b/tests/test_focal_loss.py @@ -132,7 +132,7 @@ def test_consistency_with_cross_entropy_2d_no_reduction(self): error = np.abs(a - b) max_error = np.maximum(error, max_error) - assert np.allclose(max_error, 0) + assert np.allclose(max_error, 0, atol=1e-6) def test_consistency_with_cross_entropy_2d_onehot_label(self): """For gamma=0 the focal loss reduces to the cross entropy loss""" diff --git a/tests/test_freeze_layers.py b/tests/test_freeze_layers.py index 1bea4ed1b5..7be8e576bf 100644 --- a/tests/test_freeze_layers.py +++ b/tests/test_freeze_layers.py @@ -40,9 +40,9 @@ def test_freeze_vars(self, device): for name, param in model.named_parameters(): if "class_layer" in name: - self.assertEqual(param.requires_grad, False) + self.assertFalse(param.requires_grad) else: - self.assertEqual(param.requires_grad, True) + self.assertTrue(param.requires_grad) @parameterized.expand(TEST_CASES) def test_exclude_vars(self, device): @@ -53,9 +53,9 @@ def test_exclude_vars(self, device): for name, param in model.named_parameters(): if "class_layer" in name: - self.assertEqual(param.requires_grad, True) + self.assertTrue(param.requires_grad) else: - self.assertEqual(param.requires_grad, 
False) + self.assertFalse(param.requires_grad) if __name__ == "__main__": diff --git a/tests/test_generalized_dice_focal_loss.py b/tests/test_generalized_dice_focal_loss.py index 8a0a80865e..65252611ca 100644 --- a/tests/test_generalized_dice_focal_loss.py +++ b/tests/test_generalized_dice_focal_loss.py @@ -59,8 +59,18 @@ def test_result_no_onehot_no_bg(self): def test_ill_shape(self): loss = GeneralizedDiceFocalLoss() - with self.assertRaisesRegex(ValueError, ""): - loss(torch.ones((1, 2, 3)), torch.ones((1, 1, 2, 3))) + with self.assertRaises(AssertionError): + loss.forward(torch.ones((1, 2, 3)), torch.ones((1, 2, 5))) + + def test_ill_shape2(self): + loss = GeneralizedDiceFocalLoss() + with self.assertRaises(ValueError): + loss.forward(torch.ones((1, 2, 3)), torch.ones((1, 1, 2, 3))) + + def test_ill_shape3(self): + loss = GeneralizedDiceFocalLoss() + with self.assertRaises(ValueError): + loss.forward(torch.ones((1, 3, 4, 4)), torch.ones((1, 2, 4, 4))) def test_ill_lambda(self): with self.assertRaisesRegex(ValueError, ""): diff --git a/tests/test_generalized_dice_loss.py b/tests/test_generalized_dice_loss.py index 7499507129..5738f4a089 100644 --- a/tests/test_generalized_dice_loss.py +++ b/tests/test_generalized_dice_loss.py @@ -184,7 +184,7 @@ def test_differentiability(self): generalized_dice_loss = GeneralizedDiceLoss() loss = generalized_dice_loss(prediction, target) - self.assertNotEqual(loss.grad_fn, None) + self.assertIsNotNone(loss.grad_fn) def test_batch(self): prediction = torch.zeros(2, 3, 3, 3) @@ -194,7 +194,7 @@ def test_batch(self): generalized_dice_loss = GeneralizedDiceLoss(batch=True) loss = generalized_dice_loss(prediction, target) - self.assertNotEqual(loss.grad_fn, None) + self.assertIsNotNone(loss.grad_fn) def test_script(self): loss = GeneralizedDiceLoss() diff --git a/tests/test_get_package_version.py b/tests/test_get_package_version.py index ab9e69cd31..e9e1d8eca6 100644 --- a/tests/test_get_package_version.py +++ 
b/tests/test_get_package_version.py @@ -20,14 +20,14 @@ class TestGetVersion(unittest.TestCase): def test_default(self): output = get_package_version("42foobarnoexist") - self.assertTrue("UNKNOWN" in output) + self.assertIn("UNKNOWN", output) output = get_package_version("numpy") - self.assertFalse("UNKNOWN" in output) + self.assertNotIn("UNKNOWN", output) def test_msg(self): output = get_package_version("42foobarnoexist", "test") - self.assertTrue("test" in output) + self.assertIn("test", output) if __name__ == "__main__": diff --git a/tests/test_global_mutual_information_loss.py b/tests/test_global_mutual_information_loss.py index 36a1978c93..22f5e88431 100644 --- a/tests/test_global_mutual_information_loss.py +++ b/tests/test_global_mutual_information_loss.py @@ -15,6 +15,7 @@ import numpy as np import torch +from parameterized import parameterized from monai import transforms from monai.losses.image_dissimilarity import GlobalMutualInformationLoss @@ -116,24 +117,33 @@ def transformation(translate_params=(0.0, 0.0, 0.0), rotate_params=(0.0, 0.0, 0. 
class TestGlobalMutualInformationLossIll(unittest.TestCase): - def test_ill_shape(self): + @parameterized.expand( + [ + (torch.ones((1, 2), dtype=torch.float), torch.ones((1, 3), dtype=torch.float)), # mismatched_simple_dims + ( + torch.ones((1, 3, 3), dtype=torch.float), + torch.ones((1, 3), dtype=torch.float), + ), # mismatched_advanced_dims + ] + ) + def test_ill_shape(self, input1, input2): loss = GlobalMutualInformationLoss() - with self.assertRaisesRegex(ValueError, ""): - loss.forward(torch.ones((1, 2), dtype=torch.float), torch.ones((1, 3), dtype=torch.float, device=device)) - with self.assertRaisesRegex(ValueError, ""): - loss.forward(torch.ones((1, 3, 3), dtype=torch.float), torch.ones((1, 3), dtype=torch.float, device=device)) - - def test_ill_opts(self): + with self.assertRaises(ValueError): + loss.forward(input1, input2) + + @parameterized.expand( + [ + (0, "mean", ValueError, ""), # num_bins_zero + (-1, "mean", ValueError, ""), # num_bins_negative + (64, "unknown", ValueError, ""), # reduction_unknown + (64, None, ValueError, ""), # reduction_none + ] + ) + def test_ill_opts(self, num_bins, reduction, expected_exception, expected_message): pred = torch.ones((1, 3, 3, 3, 3), dtype=torch.float, device=device) target = torch.ones((1, 3, 3, 3, 3), dtype=torch.float, device=device) - with self.assertRaisesRegex(ValueError, ""): - GlobalMutualInformationLoss(num_bins=0)(pred, target) - with self.assertRaisesRegex(ValueError, ""): - GlobalMutualInformationLoss(num_bins=-1)(pred, target) - with self.assertRaisesRegex(ValueError, ""): - GlobalMutualInformationLoss(reduction="unknown")(pred, target) - with self.assertRaisesRegex(ValueError, ""): - GlobalMutualInformationLoss(reduction=None)(pred, target) + with self.assertRaisesRegex(expected_exception, expected_message): + GlobalMutualInformationLoss(num_bins=num_bins, reduction=reduction)(pred, target) if __name__ == "__main__": diff --git a/tests/test_grid_patch.py b/tests/test_grid_patch.py index 
4b324eda1a..56af123548 100644 --- a/tests/test_grid_patch.py +++ b/tests/test_grid_patch.py @@ -124,11 +124,11 @@ def test_grid_patch_meta(self, input_parameters, image, expected, expected_meta) self.assertTrue(output.meta["path"] == expected_meta[0]["path"]) for output_patch, expected_patch, expected_patch_meta in zip(output, expected, expected_meta): assert_allclose(output_patch, expected_patch, type_test=False) - self.assertTrue(isinstance(output_patch, MetaTensor)) - self.assertTrue(output_patch.meta["location"] == expected_patch_meta["location"]) + self.assertIsInstance(output_patch, MetaTensor) + self.assertEqual(output_patch.meta["location"], expected_patch_meta["location"]) self.assertTrue(output_patch.meta["spatial_shape"], list(output_patch.shape[1:])) if "path" in expected_meta[0]: - self.assertTrue(output_patch.meta["path"] == expected_patch_meta["path"]) + self.assertEqual(output_patch.meta["path"], expected_patch_meta["path"]) if __name__ == "__main__": diff --git a/tests/test_handler_ignite_metric.py b/tests/test_handler_ignite_metric.py index 28e0b69621..3e42bda35d 100644 --- a/tests/test_handler_ignite_metric.py +++ b/tests/test_handler_ignite_metric.py @@ -16,7 +16,7 @@ import torch from parameterized import parameterized -from monai.handlers import IgniteMetric, IgniteMetricHandler, from_engine +from monai.handlers import IgniteMetricHandler, from_engine from monai.losses import DiceLoss from monai.metrics import LossMetric from tests.utils import SkipIfNoModule, assert_allclose, optional_import @@ -172,7 +172,7 @@ def _val_func(engine, batch): @parameterized.expand(TEST_CASES[0:2]) def test_old_ignite_metric(self, input_param, input_data, expected_val): loss_fn = DiceLoss(**input_param) - ignite_metric = IgniteMetric(loss_fn=loss_fn, output_transform=from_engine(["pred", "label"])) + ignite_metric = IgniteMetricHandler(loss_fn=loss_fn, output_transform=from_engine(["pred", "label"])) def _val_func(engine, batch): pass diff --git 
a/tests/test_handler_metrics_saver_dist.py b/tests/test_handler_metrics_saver_dist.py index 46c9ad27d7..2e12b08aa9 100644 --- a/tests/test_handler_metrics_saver_dist.py +++ b/tests/test_handler_metrics_saver_dist.py @@ -51,8 +51,10 @@ def _val_func(engine, batch): engine = Engine(_val_func) + # define here to ensure symbol always exists regardless of the following if conditions + data = [{PostFix.meta("image"): {"filename_or_obj": [fnames[0]]}}] + if my_rank == 0: - data = [{PostFix.meta("image"): {"filename_or_obj": [fnames[0]]}}] @engine.on(Events.EPOCH_COMPLETED) def _save_metrics0(engine): diff --git a/tests/test_handler_prob_map_producer.py b/tests/test_handler_prob_map_producer.py index 347f8cb92c..406fe77c8f 100644 --- a/tests/test_handler_prob_map_producer.py +++ b/tests/test_handler_prob_map_producer.py @@ -30,6 +30,7 @@ class TestDataset(Dataset): + __test__ = False # indicate to pytest that this class is not intended for collection def __init__(self, name, size): super().__init__( @@ -64,6 +65,7 @@ def __getitem__(self, index): class TestEvaluator(Evaluator): + __test__ = False # indicate to pytest that this class is not intended for collection def _iteration(self, engine, batchdata): return batchdata diff --git a/tests/test_handler_stats.py b/tests/test_handler_stats.py index f876cff2a3..52da5c179b 100644 --- a/tests/test_handler_stats.py +++ b/tests/test_handler_stats.py @@ -76,9 +76,9 @@ def _update_metric(engine): if has_key_word.match(line): content_count += 1 if epoch_log is True: - self.assertTrue(content_count == max_epochs) + self.assertEqual(content_count, max_epochs) else: - self.assertTrue(content_count == 2) # 2 = len([1, 2]) from event_filter + self.assertEqual(content_count, 2) # 2 = len([1, 2]) from event_filter @parameterized.expand([[True], [get_event_filter([1, 3])]]) def test_loss_print(self, iteration_log): @@ -116,9 +116,9 @@ def _train_func(engine, batch): if has_key_word.match(line): content_count += 1 if iteration_log is True: - 
self.assertTrue(content_count == num_iters * max_epochs) + self.assertEqual(content_count, num_iters * max_epochs) else: - self.assertTrue(content_count == 2) # 2 = len([1, 3]) from event_filter + self.assertEqual(content_count, 2) # 2 = len([1, 3]) from event_filter def test_loss_dict(self): log_stream = StringIO() @@ -150,7 +150,7 @@ def _train_func(engine, batch): for line in output_str.split("\n"): if has_key_word.match(line): content_count += 1 - self.assertTrue(content_count > 0) + self.assertGreater(content_count, 0) def test_loss_file(self): key_to_handler = "test_logging" @@ -184,7 +184,7 @@ def _train_func(engine, batch): for line in output_str.split("\n"): if has_key_word.match(line): content_count += 1 - self.assertTrue(content_count > 0) + self.assertGreater(content_count, 0) def test_exception(self): # set up engine @@ -239,7 +239,7 @@ def _update_metric(engine): for line in output_str.split("\n"): if has_key_word.match(line): content_count += 1 - self.assertTrue(content_count > 0) + self.assertGreater(content_count, 0) def test_default_logger(self): log_stream = StringIO() @@ -274,7 +274,7 @@ def _train_func(engine, batch): for line in output_str.split("\n"): if has_key_word.match(line): content_count += 1 - self.assertTrue(content_count > 0) + self.assertGreater(content_count, 0) if __name__ == "__main__": diff --git a/tests/test_handler_validation.py b/tests/test_handler_validation.py index 752b1d3df7..92f8578f11 100644 --- a/tests/test_handler_validation.py +++ b/tests/test_handler_validation.py @@ -22,6 +22,7 @@ class TestEvaluator(Evaluator): + __test__ = False # indicate to pytest that this class is not intended for collection def _iteration(self, engine, batchdata): engine.state.output = "called" diff --git a/tests/test_hausdorff_loss.py b/tests/test_hausdorff_loss.py index f279d45b14..f2211008c2 100644 --- a/tests/test_hausdorff_loss.py +++ b/tests/test_hausdorff_loss.py @@ -219,17 +219,12 @@ def test_ill_opts(self): with 
self.assertRaisesRegex(ValueError, ""): HausdorffDTLoss(reduction=None)(chn_input, chn_target) - def test_input_warnings(self): + @parameterized.expand([(False, False, False), (False, True, False), (False, False, True)]) + def test_input_warnings(self, include_background, softmax, to_onehot_y): chn_input = torch.ones((1, 1, 1, 3)) chn_target = torch.ones((1, 1, 1, 3)) with self.assertWarns(Warning): - loss = HausdorffDTLoss(include_background=False) - loss.forward(chn_input, chn_target) - with self.assertWarns(Warning): - loss = HausdorffDTLoss(softmax=True) - loss.forward(chn_input, chn_target) - with self.assertWarns(Warning): - loss = HausdorffDTLoss(to_onehot_y=True) + loss = HausdorffDTLoss(include_background=include_background, softmax=softmax, to_onehot_y=to_onehot_y) loss.forward(chn_input, chn_target) @@ -256,17 +251,12 @@ def test_ill_opts(self): with self.assertRaisesRegex(ValueError, ""): LogHausdorffDTLoss(reduction=None)(chn_input, chn_target) - def test_input_warnings(self): + @parameterized.expand([(False, False, False), (False, True, False), (False, False, True)]) + def test_input_warnings(self, include_background, softmax, to_onehot_y): chn_input = torch.ones((1, 1, 1, 3)) chn_target = torch.ones((1, 1, 1, 3)) with self.assertWarns(Warning): - loss = LogHausdorffDTLoss(include_background=False) - loss.forward(chn_input, chn_target) - with self.assertWarns(Warning): - loss = LogHausdorffDTLoss(softmax=True) - loss.forward(chn_input, chn_target) - with self.assertWarns(Warning): - loss = LogHausdorffDTLoss(to_onehot_y=True) + loss = LogHausdorffDTLoss(include_background=include_background, softmax=softmax, to_onehot_y=to_onehot_y) loss.forward(chn_input, chn_target) diff --git a/tests/test_hilbert_transform.py b/tests/test_hilbert_transform.py index 879a74969d..b91ba3f6b7 100644 --- a/tests/test_hilbert_transform.py +++ b/tests/test_hilbert_transform.py @@ -19,11 +19,11 @@ from monai.networks.layers import HilbertTransform from monai.utils import 
OptionalImportError -from tests.utils import SkipIfModule, SkipIfNoModule, skip_if_no_cuda +from tests.utils import SkipIfModule, SkipIfNoModule def create_expected_numpy_output(input_datum, **kwargs): - x = np.fft.fft(input_datum.cpu().numpy() if input_datum.device.type == "cuda" else input_datum.numpy(), **kwargs) + x = np.fft.fft(input_datum.cpu().numpy(), **kwargs) f = np.fft.fftfreq(x.shape[kwargs["axis"]]) u = np.heaviside(f, 0.5) new_dims_before = kwargs["axis"] @@ -44,19 +44,15 @@ def create_expected_numpy_output(input_datum, **kwargs): # CPU TEST DATA cpu_input_data = {} -cpu_input_data["1D"] = torch.as_tensor(hann_windowed_sine, device=cpu).unsqueeze(0).unsqueeze(0) -cpu_input_data["2D"] = ( - torch.as_tensor(np.stack([hann_windowed_sine] * 10, axis=1), device=cpu).unsqueeze(0).unsqueeze(0) -) -cpu_input_data["3D"] = ( - torch.as_tensor(np.stack([np.stack([hann_windowed_sine] * 10, axis=1)] * 10, axis=2), device=cpu) - .unsqueeze(0) - .unsqueeze(0) -) -cpu_input_data["1D 2CH"] = torch.as_tensor(np.stack([hann_windowed_sine] * 10, axis=1), device=cpu).unsqueeze(0) +cpu_input_data["1D"] = torch.as_tensor(hann_windowed_sine, device=cpu)[None, None] +cpu_input_data["2D"] = torch.as_tensor(np.stack([hann_windowed_sine] * 10, axis=1), device=cpu)[None, None] +cpu_input_data["3D"] = torch.as_tensor( + np.stack([np.stack([hann_windowed_sine] * 10, axis=1)] * 10, axis=2), device=cpu +)[None, None] +cpu_input_data["1D 2CH"] = torch.as_tensor(np.stack([hann_windowed_sine] * 10, axis=1), device=cpu)[None] cpu_input_data["2D 2CH"] = torch.as_tensor( np.stack([np.stack([hann_windowed_sine] * 10, axis=1)] * 10, axis=2), device=cpu -).unsqueeze(0) +)[None] # SINGLE-CHANNEL CPU VALUE TESTS @@ -97,64 +93,21 @@ def create_expected_numpy_output(input_datum, **kwargs): 1e-5, # absolute tolerance ] +TEST_CASES_CPU = [ + TEST_CASE_1D_SINE_CPU, + TEST_CASE_2D_SINE_CPU, + TEST_CASE_3D_SINE_CPU, + TEST_CASE_1D_2CH_SINE_CPU, + TEST_CASE_2D_2CH_SINE_CPU, +] + # GPU TEST DATA if 
torch.cuda.is_available(): gpu = torch.device("cuda") - - gpu_input_data = {} - gpu_input_data["1D"] = torch.as_tensor(hann_windowed_sine, device=gpu).unsqueeze(0).unsqueeze(0) - gpu_input_data["2D"] = ( - torch.as_tensor(np.stack([hann_windowed_sine] * 10, axis=1), device=gpu).unsqueeze(0).unsqueeze(0) - ) - gpu_input_data["3D"] = ( - torch.as_tensor(np.stack([np.stack([hann_windowed_sine] * 10, axis=1)] * 10, axis=2), device=gpu) - .unsqueeze(0) - .unsqueeze(0) - ) - gpu_input_data["1D 2CH"] = torch.as_tensor(np.stack([hann_windowed_sine] * 10, axis=1), device=gpu).unsqueeze(0) - gpu_input_data["2D 2CH"] = torch.as_tensor( - np.stack([np.stack([hann_windowed_sine] * 10, axis=1)] * 10, axis=2), device=gpu - ).unsqueeze(0) - - # SINGLE CHANNEL GPU VALUE TESTS - - TEST_CASE_1D_SINE_GPU = [ - {}, # args (empty, so use default) - gpu_input_data["1D"], # Input data: Random 1D signal - create_expected_numpy_output(gpu_input_data["1D"], axis=2), # Expected output: FFT of signal - 1e-5, # absolute tolerance - ] - - TEST_CASE_2D_SINE_GPU = [ - {}, # args (empty, so use default) - gpu_input_data["2D"], # Input data: Random 1D signal - create_expected_numpy_output(gpu_input_data["2D"], axis=2), # Expected output: FFT of signal - 1e-5, # absolute tolerance - ] - - TEST_CASE_3D_SINE_GPU = [ - {}, # args (empty, so use default) - gpu_input_data["3D"], # Input data: Random 1D signal - create_expected_numpy_output(gpu_input_data["3D"], axis=2), # Expected output: FFT of signal - 1e-5, # absolute tolerance - ] - - # MULTICHANNEL GPU VALUE TESTS, PROCESS ALONG FIRST SPATIAL AXIS - - TEST_CASE_1D_2CH_SINE_GPU = [ - {}, # args (empty, so use default) - gpu_input_data["1D 2CH"], # Input data: Random 1D signal - create_expected_numpy_output(gpu_input_data["1D 2CH"], axis=2), - 1e-5, # absolute tolerance - ] - - TEST_CASE_2D_2CH_SINE_GPU = [ - {}, # args (empty, so use default) - gpu_input_data["2D 2CH"], # Input data: Random 1D signal - create_expected_numpy_output(gpu_input_data["2D 
2CH"], axis=2), - 1e-5, # absolute tolerance - ] + TEST_CASES_GPU = [[args, image.to(gpu), exp_data, atol] for args, image, exp_data, atol in TEST_CASES_CPU] +else: + TEST_CASES_GPU = [] # TESTS CHECKING PADDING, AXIS SELECTION ETC ARE COVERED BY test_detect_envelope.py @@ -162,42 +115,10 @@ def create_expected_numpy_output(input_datum, **kwargs): @SkipIfNoModule("torch.fft") class TestHilbertTransformCPU(unittest.TestCase): - @parameterized.expand( - [ - TEST_CASE_1D_SINE_CPU, - TEST_CASE_2D_SINE_CPU, - TEST_CASE_3D_SINE_CPU, - TEST_CASE_1D_2CH_SINE_CPU, - TEST_CASE_2D_2CH_SINE_CPU, - ] - ) - def test_value(self, arguments, image, expected_data, atol): - result = HilbertTransform(**arguments)(image) - result = result.squeeze(0).squeeze(0).numpy() - np.testing.assert_allclose(result, expected_data.squeeze(), atol=atol) - - -@skip_if_no_cuda -@SkipIfNoModule("torch.fft") -class TestHilbertTransformGPU(unittest.TestCase): - - @parameterized.expand( - ( - [] - if not torch.cuda.is_available() - else [ - TEST_CASE_1D_SINE_GPU, - TEST_CASE_2D_SINE_GPU, - TEST_CASE_3D_SINE_GPU, - TEST_CASE_1D_2CH_SINE_GPU, - TEST_CASE_2D_2CH_SINE_GPU, - ] - ), - skip_on_empty=True, - ) + @parameterized.expand(TEST_CASES_CPU + TEST_CASES_GPU) def test_value(self, arguments, image, expected_data, atol): result = HilbertTransform(**arguments)(image) - result = result.squeeze(0).squeeze(0).cpu().numpy() + result = np.squeeze(result.cpu().numpy()) np.testing.assert_allclose(result, expected_data.squeeze(), atol=atol) diff --git a/tests/test_image_filter.py b/tests/test_image_filter.py index adc9dade9c..76e38d94f4 100644 --- a/tests/test_image_filter.py +++ b/tests/test_image_filter.py @@ -38,6 +38,7 @@ class TestModule(torch.nn.Module): + __test__ = False # indicate to pytest that this class is not intended for collection def __init__(self): super().__init__() diff --git a/tests/test_integration_bundle_run.py b/tests/test_integration_bundle_run.py index c2e0fb55b7..60aaef05bf 100644 --- 
a/tests/test_integration_bundle_run.py +++ b/tests/test_integration_bundle_run.py @@ -135,9 +135,8 @@ def test_scripts_fold(self): command_run = cmd + ["run", "training", "--config_file", config_file, "--meta_file", meta_file] completed_process = subprocess.run(command_run, check=True, capture_output=True, text=True) output = repr(completed_process.stdout).replace("\\n", "\n").replace("\\t", "\t") # Get the captured output - print(output) - self.assertTrue(expected_condition in output) + self.assertIn(expected_condition, output) command_run_workflow = cmd + [ "run_workflow", "--run_id", @@ -149,8 +148,7 @@ def test_scripts_fold(self): ] completed_process = subprocess.run(command_run_workflow, check=True, capture_output=True, text=True) output = repr(completed_process.stdout).replace("\\n", "\n").replace("\\t", "\t") # Get the captured output - print(output) - self.assertTrue(expected_condition in output) + self.assertIn(expected_condition, output) # test missing meta file self.assertIn("ERROR", command_line_tests(cmd + ["run", "training", "--config_file", config_file])) diff --git a/tests/test_integration_unet_2d.py b/tests/test_integration_unet_2d.py index 918190775c..3b40682de0 100644 --- a/tests/test_integration_unet_2d.py +++ b/tests/test_integration_unet_2d.py @@ -35,6 +35,7 @@ def __getitem__(self, _unused_id): def __len__(self): return train_steps + net = None if net_name == "basicunet": net = BasicUNet(spatial_dims=2, in_channels=1, out_channels=1, features=(4, 8, 8, 16, 16, 32)) elif net_name == "unet": diff --git a/tests/test_integration_workflows_adversarial.py b/tests/test_integration_workflows_adversarial.py new file mode 100644 index 0000000000..f323fc9917 --- /dev/null +++ b/tests/test_integration_workflows_adversarial.py @@ -0,0 +1,173 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os +import shutil +import tempfile +import unittest +from glob import glob + +import numpy as np +import torch + +import monai +from monai.data import create_test_image_2d +from monai.engines import AdversarialTrainer +from monai.handlers import CheckpointSaver, StatsHandler, TensorBoardStatsHandler +from monai.networks.nets import AutoEncoder, Discriminator +from monai.transforms import Compose, EnsureChannelFirstd, LoadImaged, RandFlipd, ScaleIntensityd +from monai.utils import AdversarialKeys as Keys +from monai.utils import CommonKeys, optional_import, set_determinism +from tests.utils import DistTestCase, TimedCall, skip_if_quick + +nib, has_nibabel = optional_import("nibabel") + + +def run_training_test(root_dir, device="cuda:0"): + learning_rate = 2e-4 + real_label = 1 + fake_label = 0 + + real_images = sorted(glob(os.path.join(root_dir, "img*.nii.gz"))) + train_files = [{CommonKeys.IMAGE: img, CommonKeys.LABEL: img} for img in zip(real_images)] + + # prepare real data + train_transforms = Compose( + [ + LoadImaged(keys=[CommonKeys.IMAGE, CommonKeys.LABEL]), + EnsureChannelFirstd(keys=[CommonKeys.IMAGE, CommonKeys.LABEL], channel_dim=2), + ScaleIntensityd(keys=[CommonKeys.IMAGE]), + RandFlipd(keys=[CommonKeys.IMAGE, CommonKeys.LABEL], prob=0.5), + ] + ) + train_ds = monai.data.CacheDataset(data=train_files, transform=train_transforms, cache_rate=0.5) + train_loader = monai.data.DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=4) + + # Create Discriminator + discriminator_net = Discriminator( + 
in_shape=(1, 64, 64), channels=(8, 16, 32, 64, 1), strides=(2, 2, 2, 2, 1), num_res_units=1, kernel_size=5 + ).to(device) + discriminator_opt = torch.optim.Adam(discriminator_net.parameters(), learning_rate) + discriminator_loss_criterion = torch.nn.BCELoss() + + def discriminator_loss(real_logits, fake_logits): + real_target = real_logits.new_full((real_logits.shape[0], 1), real_label) + fake_target = fake_logits.new_full((fake_logits.shape[0], 1), fake_label) + real_loss = discriminator_loss_criterion(real_logits, real_target) + fake_loss = discriminator_loss_criterion(fake_logits.detach(), fake_target) + return torch.div(torch.add(real_loss, fake_loss), 2) + + # Create Generator + generator_network = AutoEncoder( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(8, 16, 32, 64), + strides=(2, 2, 2, 2), + num_res_units=1, + num_inter_units=1, + ) + generator_network = generator_network.to(device) + generator_optimiser = torch.optim.Adam(generator_network.parameters(), learning_rate) + generator_loss_criterion = torch.nn.MSELoss() + + def reconstruction_loss(recon_images, real_images): + return generator_loss_criterion(recon_images, real_images) + + def generator_loss(fake_logits): + fake_target = fake_logits.new_full((fake_logits.shape[0], 1), real_label) + recon_loss = discriminator_loss_criterion(fake_logits.detach(), fake_target) + return recon_loss + + key_train_metric = None + + train_handlers = [ + StatsHandler( + name="training_loss", + output_transform=lambda x: { + Keys.RECONSTRUCTION_LOSS: x[Keys.RECONSTRUCTION_LOSS], + Keys.DISCRIMINATOR_LOSS: x[Keys.DISCRIMINATOR_LOSS], + Keys.GENERATOR_LOSS: x[Keys.GENERATOR_LOSS], + }, + ), + TensorBoardStatsHandler( + log_dir=root_dir, + tag_name="training_loss", + output_transform=lambda x: { + Keys.RECONSTRUCTION_LOSS: x[Keys.RECONSTRUCTION_LOSS], + Keys.DISCRIMINATOR_LOSS: x[Keys.DISCRIMINATOR_LOSS], + Keys.GENERATOR_LOSS: x[Keys.GENERATOR_LOSS], + }, + ), + CheckpointSaver( + save_dir=root_dir, + 
save_dict={"g_net": generator_network, "d_net": discriminator_net}, + save_interval=2, + epoch_level=True, + ), + ] + + num_epochs = 5 + + trainer = AdversarialTrainer( + device=device, + max_epochs=num_epochs, + train_data_loader=train_loader, + g_network=generator_network, + g_optimizer=generator_optimiser, + g_loss_function=generator_loss, + recon_loss_function=reconstruction_loss, + d_network=discriminator_net, + d_optimizer=discriminator_opt, + d_loss_function=discriminator_loss, + non_blocking=True, + key_train_metric=key_train_metric, + train_handlers=train_handlers, + ) + trainer.run() + + return trainer.state + + +@skip_if_quick +@unittest.skipUnless(has_nibabel, "Requires nibabel library.") +class IntegrationWorkflowsAdversarialTrainer(DistTestCase): + def setUp(self): + set_determinism(seed=0) + + self.data_dir = tempfile.mkdtemp() + for i in range(40): + im, _ = create_test_image_2d(64, 64, num_objs=3, rad_max=14, num_seg_classes=1, channel_dim=-1) + n = nib.Nifti1Image(im, np.eye(4)) + nib.save(n, os.path.join(self.data_dir, f"img{i:d}.nii.gz")) + + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu:0") + monai.config.print_config() + + def tearDown(self): + set_determinism(seed=None) + shutil.rmtree(self.data_dir) + + @TimedCall(seconds=200, daemon=False) + def test_training(self): + torch.manual_seed(0) + + finish_state = run_training_test(self.data_dir, device=self.device) + + # Assert AdversarialTrainer training finished + self.assertEqual(finish_state.iteration, 100) + self.assertEqual(finish_state.epoch, 5) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_inverse_collation.py b/tests/test_inverse_collation.py index f33b5c67eb..bf3972e6bd 100644 --- a/tests/test_inverse_collation.py +++ b/tests/test_inverse_collation.py @@ -133,7 +133,7 @@ def test_collation(self, _, transform, collate_fn, ndim): d = decollate_batch(item) self.assertTrue(len(d) <= self.batch_size) for b in d: - 
self.assertTrue(isinstance(b["image"], MetaTensor)) + self.assertIsInstance(b["image"], MetaTensor) np.testing.assert_array_equal( b["image"].applied_operations[-1]["orig_size"], b["label"].applied_operations[-1]["orig_size"] ) diff --git a/tests/test_invertd.py b/tests/test_invertd.py index c32a3af643..f6e8fc40e7 100644 --- a/tests/test_invertd.py +++ b/tests/test_invertd.py @@ -134,7 +134,7 @@ def test_invert(self): # 25300: 2 workers (cpu, non-macos) # 1812: 0 workers (gpu or macos) # 1821: windows torch 1.10.0 - self.assertTrue((reverted.size - n_good) < 40000, f"diff. {reverted.size - n_good}") + self.assertLess((reverted.size - n_good), 40000, f"diff. {reverted.size - n_good}") set_determinism(seed=None) diff --git a/tests/test_latent_diffusion_inferer.py b/tests/test_latent_diffusion_inferer.py new file mode 100644 index 0000000000..2e04ad6c5c --- /dev/null +++ b/tests/test_latent_diffusion_inferer.py @@ -0,0 +1,824 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.inferers import LatentDiffusionInferer +from monai.networks.nets import VQVAE, AutoencoderKL, DiffusionModelUNet, SPADEAutoencoderKL, SPADEDiffusionModelUNet +from monai.networks.schedulers import DDPMScheduler +from monai.utils import optional_import + +_, has_einops = optional_import("einops") +TEST_CASES = [ + [ + "AutoencoderKL", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], + [ + "VQVAE", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": [4, 4], + "num_res_layers": 1, + "num_res_channels": [4, 4], + "downsample_parameters": ((2, 4, 1, 1), (2, 4, 1, 1)), + "upsample_parameters": ((2, 4, 1, 1, 0), (2, 4, 1, 1, 0)), + "num_embeddings": 16, + "embedding_dim": 3, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [8, 8], + "norm_num_groups": 8, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + (1, 1, 16, 16), + (1, 3, 4, 4), + ], + [ + "VQVAE", + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": [4, 4], + "num_res_layers": 1, + "num_res_channels": [4, 4], + "downsample_parameters": ((2, 4, 1, 1), (2, 4, 1, 1)), + "upsample_parameters": ((2, 4, 1, 1, 0), (2, 4, 1, 1, 0)), + "num_embeddings": 16, + "embedding_dim": 3, + }, + "DiffusionModelUNet", + { + 
"spatial_dims": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [8, 8], + "norm_num_groups": 8, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + (1, 1, 16, 16, 16), + (1, 3, 4, 4, 4), + ], + [ + "AutoencoderKL", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "SPADEDiffusionModelUNet", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], +] +TEST_CASES_DIFF_SHAPES = [ + [ + "AutoencoderKL", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + (1, 1, 12, 12), + (1, 3, 8, 8), + ], + [ + "VQVAE", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": [4, 4], + "num_res_layers": 1, + "num_res_channels": [4, 4], + "downsample_parameters": ((2, 4, 1, 1), (2, 4, 1, 1)), + "upsample_parameters": ((2, 4, 1, 1, 0), (2, 4, 1, 1, 0)), + "num_embeddings": 16, + "embedding_dim": 3, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [8, 8], + "norm_num_groups": 8, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + (1, 1, 
12, 12), + (1, 3, 8, 8), + ], + [ + "VQVAE", + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": [4, 4], + "num_res_layers": 1, + "num_res_channels": [4, 4], + "downsample_parameters": ((2, 4, 1, 1), (2, 4, 1, 1)), + "upsample_parameters": ((2, 4, 1, 1, 0), (2, 4, 1, 1, 0)), + "num_embeddings": 16, + "embedding_dim": 3, + }, + "DiffusionModelUNet", + { + "spatial_dims": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [8, 8], + "norm_num_groups": 8, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + (1, 1, 12, 12, 12), + (1, 3, 8, 8, 8), + ], + [ + "SPADEAutoencoderKL", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "DiffusionModelUNet", + { + "spatial_dims": 2, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], + [ + "AutoencoderKL", + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "SPADEDiffusionModelUNet", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], + [ + "SPADEAutoencoderKL", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "latent_channels": 3, + "attention_levels": [False, 
False], + "num_res_blocks": 1, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + "norm_num_groups": 4, + }, + "SPADEDiffusionModelUNet", + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 3, + "out_channels": 3, + "channels": [4, 4], + "norm_num_groups": 4, + "attention_levels": [False, False], + "num_res_blocks": 1, + "num_head_channels": 4, + }, + (1, 1, 8, 8), + (1, 3, 4, 4), + ], +] + + +class TestDiffusionSamplingInferer(unittest.TestCase): + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_prediction_shape( + self, ae_model_type, autoencoder_params, dm_model_type, stage_2_params, input_shape, latent_shape + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + noise = torch.randn(latent_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = LatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + seg=input_seg, + noise=noise, + 
timesteps=timesteps, + ) + else: + prediction = inferer( + inputs=input, autoencoder_model=stage_1, diffusion_model=stage_2, noise=noise, timesteps=timesteps + ) + self.assertEqual(prediction.shape, latent_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sample_shape( + self, ae_model_type, autoencoder_params, dm_model_type, stage_2_params, input_shape, latent_shape + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + noise = torch.randn(latent_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = LatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if ae_model_type == "SPADEAutoencoderKL" or dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + seg=input_seg, + ) + else: + sample = inferer.sample( + input_noise=noise, autoencoder_model=stage_1, diffusion_model=stage_2, scheduler=scheduler + ) + self.assertEqual(sample.shape, input_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sample_intermediates( + self, ae_model_type, autoencoder_params, dm_model_type, stage_2_params, input_shape, 
latent_shape + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + noise = torch.randn(latent_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = LatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if ae_model_type == "SPADEAutoencoderKL" or dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample, intermediates = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + seg=input_seg, + save_intermediates=True, + intermediate_steps=1, + ) + else: + sample, intermediates = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + save_intermediates=True, + intermediate_steps=1, + ) + self.assertEqual(len(intermediates), 10) + self.assertEqual(intermediates[0].shape, input_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_get_likelihoods( + self, ae_model_type, autoencoder_params, dm_model_type, stage_2_params, input_shape, latent_shape + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if 
ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = LatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample, intermediates = inferer.get_likelihood( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + save_intermediates=True, + seg=input_seg, + ) + else: + sample, intermediates = inferer.get_likelihood( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + save_intermediates=True, + ) + self.assertEqual(len(intermediates), 10) + self.assertEqual(intermediates[0].shape, latent_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_resample_likelihoods( + self, ae_model_type, autoencoder_params, dm_model_type, stage_2_params, input_shape, latent_shape + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + if dm_model_type == 
"SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = LatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample, intermediates = inferer.get_likelihood( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + save_intermediates=True, + resample_latent_likelihoods=True, + seg=input_seg, + ) + else: + sample, intermediates = inferer.get_likelihood( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + save_intermediates=True, + resample_latent_likelihoods=True, + ) + self.assertEqual(len(intermediates), 10) + self.assertEqual(intermediates[0].shape[2:], input_shape[2:]) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_prediction_shape_conditioned_concat( + self, ae_model_type, autoencoder_params, dm_model_type, stage_2_params, input_shape, latent_shape + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + stage_2_params = stage_2_params.copy() + n_concat_channel = 3 + stage_2_params["in_channels"] = stage_2_params["in_channels"] + 
n_concat_channel + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + noise = torch.randn(latent_shape).to(device) + conditioning_shape = list(latent_shape) + conditioning_shape[1] = n_concat_channel + conditioning = torch.randn(conditioning_shape).to(device) + + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = LatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + noise=noise, + timesteps=timesteps, + condition=conditioning, + mode="concat", + seg=input_seg, + ) + else: + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + noise=noise, + timesteps=timesteps, + condition=conditioning, + mode="concat", + ) + self.assertEqual(prediction.shape, latent_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_sample_shape_conditioned_concat( + self, ae_model_type, autoencoder_params, dm_model_type, stage_2_params, input_shape, latent_shape + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type == "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if 
ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + stage_2_params = stage_2_params.copy() + n_concat_channel = 3 + stage_2_params["in_channels"] = stage_2_params["in_channels"] + n_concat_channel + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + noise = torch.randn(latent_shape).to(device) + conditioning_shape = list(latent_shape) + conditioning_shape[1] = n_concat_channel + conditioning = torch.randn(conditioning_shape).to(device) + + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = LatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + sample = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + conditioning=conditioning, + mode="concat", + seg=input_seg, + ) + else: + sample = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + conditioning=conditioning, + mode="concat", + ) + self.assertEqual(sample.shape, input_shape) + + @parameterized.expand(TEST_CASES_DIFF_SHAPES) + @skipUnless(has_einops, "Requires einops") + def test_sample_shape_different_latents( + self, ae_model_type, autoencoder_params, dm_model_type, stage_2_params, input_shape, latent_shape + ): + stage_1 = None + + if ae_model_type == "AutoencoderKL": + stage_1 = AutoencoderKL(**autoencoder_params) + if ae_model_type 
== "VQVAE": + stage_1 = VQVAE(**autoencoder_params) + if ae_model_type == "SPADEAutoencoderKL": + stage_1 = SPADEAutoencoderKL(**autoencoder_params) + if dm_model_type == "SPADEDiffusionModelUNet": + stage_2 = SPADEDiffusionModelUNet(**stage_2_params) + else: + stage_2 = DiffusionModelUNet(**stage_2_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + noise = torch.randn(latent_shape).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + # We infer the VAE shape + autoencoder_latent_shape = [i // (2 ** (len(autoencoder_params["channels"]) - 1)) for i in input_shape[2:]] + inferer = LatentDiffusionInferer( + scheduler=scheduler, + scale_factor=1.0, + ldm_latent_shape=list(latent_shape[2:]), + autoencoder_latent_shape=autoencoder_latent_shape, + ) + scheduler.set_timesteps(num_inference_steps=10) + + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],), device=input.device).long() + + if dm_model_type == "SPADEDiffusionModelUNet": + input_shape_seg = list(input_shape) + if "label_nc" in stage_2_params.keys(): + input_shape_seg[1] = stage_2_params["label_nc"] + else: + input_shape_seg[1] = autoencoder_params["label_nc"] + input_seg = torch.randn(input_shape_seg).to(device) + prediction = inferer( + inputs=input, + autoencoder_model=stage_1, + diffusion_model=stage_2, + noise=noise, + timesteps=timesteps, + seg=input_seg, + ) + else: + prediction = inferer( + inputs=input, autoencoder_model=stage_1, diffusion_model=stage_2, noise=noise, timesteps=timesteps + ) + self.assertEqual(prediction.shape, latent_shape) + + @skipUnless(has_einops, "Requires einops") + def test_incompatible_spade_setup(self): + stage_1 = SPADEAutoencoderKL( + spatial_dims=2, + label_nc=6, + in_channels=1, + out_channels=1, + channels=(4, 4), + latent_channels=3, + attention_levels=[False, False], + num_res_blocks=1, + 
with_encoder_nonlocal_attn=False, + with_decoder_nonlocal_attn=False, + norm_num_groups=4, + ) + stage_2 = SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=3, + out_channels=3, + channels=[4, 4], + norm_num_groups=4, + attention_levels=[False, False], + num_res_blocks=1, + num_head_channels=4, + ) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + noise = torch.randn((1, 3, 4, 4)).to(device) + input_seg = torch.randn((1, 3, 8, 8)).to(device) + scheduler = DDPMScheduler(num_train_timesteps=10) + inferer = LatentDiffusionInferer(scheduler=scheduler, scale_factor=1.0) + scheduler.set_timesteps(num_inference_steps=10) + + with self.assertRaises(ValueError): + _ = inferer.sample( + input_noise=noise, + autoencoder_model=stage_1, + diffusion_model=stage_2, + scheduler=scheduler, + seg=input_seg, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_load_imaged.py b/tests/test_load_imaged.py index 699ed70059..914240c705 100644 --- a/tests/test_load_imaged.py +++ b/tests/test_load_imaged.py @@ -190,7 +190,7 @@ def test_correct(self, input_p, expected_shape, track_meta): self.assertTrue(hasattr(r, "affine")) self.assertIsInstance(r.affine, torch.Tensor) self.assertEqual(r.meta["space"], "RAS") - self.assertTrue("qform_code" not in r.meta) + self.assertNotIn("qform_code", r.meta) else: self.assertIsInstance(r, torch.Tensor) self.assertNotIsInstance(r, MetaTensor) diff --git a/tests/test_load_spacing_orientation.py b/tests/test_load_spacing_orientation.py index 63422761ca..cbc730e1bb 100644 --- a/tests/test_load_spacing_orientation.py +++ b/tests/test_load_spacing_orientation.py @@ -48,7 +48,7 @@ def test_load_spacingd(self, filename): ref = resample_to_output(anat, (1, 0.2, 1), order=1) t2 = time.time() print(f"time scipy: {t2 - t1}") - self.assertTrue(t2 >= t1) + self.assertGreaterEqual(t2, t1) 
np.testing.assert_allclose(res_dict["image"].affine, ref.affine) np.testing.assert_allclose(res_dict["image"].shape[1:], ref.shape) np.testing.assert_allclose(ref.get_fdata(), res_dict["image"][0], atol=0.05) @@ -68,7 +68,7 @@ def test_load_spacingd_rotate(self, filename): ref = resample_to_output(anat, (1, 2, 3), order=1) t2 = time.time() print(f"time scipy: {t2 - t1}") - self.assertTrue(t2 >= t1) + self.assertGreaterEqual(t2, t1) np.testing.assert_allclose(res_dict["image"].affine, ref.affine) if "anatomical" not in filename: np.testing.assert_allclose(res_dict["image"].shape[1:], ref.shape) diff --git a/tests/test_look_up_option.py b/tests/test_look_up_option.py index d40b7eaa8c..75560b4ac4 100644 --- a/tests/test_look_up_option.py +++ b/tests/test_look_up_option.py @@ -56,7 +56,7 @@ def test_default(self): def test_str_enum(self): output = look_up_option("C", {"A", "B"}, default=None) - self.assertEqual(output, None) + self.assertIsNone(output) self.assertEqual(list(_CaseStrEnum), ["A", "B"]) self.assertEqual(_CaseStrEnum.MODE_A, "A") self.assertEqual(str(_CaseStrEnum.MODE_A), "A") diff --git a/tests/test_map_and_generate_sampling_centers.py b/tests/test_map_and_generate_sampling_centers.py new file mode 100644 index 0000000000..ff74f974b9 --- /dev/null +++ b/tests/test_map_and_generate_sampling_centers.py @@ -0,0 +1,87 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest +from copy import deepcopy + +import numpy as np +from parameterized import parameterized + +from monai.transforms import map_and_generate_sampling_centers +from monai.utils.misc import set_determinism +from tests.utils import TEST_NDARRAYS, assert_allclose + +TEST_CASE_1 = [ + # test Argmax data + { + "label": (np.array([[[0, 1, 2], [2, 0, 1], [1, 2, 0]]])), + "spatial_size": [2, 2, 2], + "num_samples": 2, + "label_spatial_shape": [3, 3, 3], + "num_classes": 3, + "image": None, + "ratios": [0, 1, 2], + "image_threshold": 0.0, + }, + tuple, + 2, + 3, +] + +TEST_CASE_2 = [ + { + "label": ( + np.array( + [ + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [[0, 1, 0], [0, 0, 1], [1, 0, 0]], + [[0, 0, 1], [1, 0, 0], [0, 1, 0]], + ] + ) + ), + "spatial_size": [2, 2, 2], + "num_samples": 1, + "ratios": None, + "label_spatial_shape": [3, 3, 3], + "image": None, + "image_threshold": 0.0, + }, + tuple, + 1, + 3, +] + + +class TestMapAndGenerateSamplingCenters(unittest.TestCase): + + @parameterized.expand([TEST_CASE_1, TEST_CASE_2]) + def test_map_and_generate_sampling_centers(self, input_data, expected_type, expected_count, expected_shape): + results = [] + for p in TEST_NDARRAYS + (None,): + input_data = deepcopy(input_data) + if p is not None: + input_data["label"] = p(input_data["label"]) + set_determinism(0) + result = map_and_generate_sampling_centers(**input_data) + self.assertIsInstance(result, expected_type) + self.assertEqual(len(result), expected_count) + self.assertEqual(len(result[0]), expected_shape) + # check for consistency between numpy, torch and torch.cuda + results.append(result) + if len(results) > 1: + for x, y in zip(result[0], result[-1]): + assert_allclose(x, y, type_test=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_matshow3d.py b/tests/test_matshow3d.py index e513025e69..2eba310f4e 100644 --- a/tests/test_matshow3d.py +++ b/tests/test_matshow3d.py @@ -78,7 +78,7 @@ def 
test_samples(self): fig, mat = matshow3d( [im[keys] for im in ims], title=f"testing {keys}", figsize=(2, 2), frames_per_row=5, every_n=2, show=False ) - self.assertTrue(mat.dtype == np.float32) + self.assertEqual(mat.dtype, np.float32) with tempfile.TemporaryDirectory() as tempdir: tempimg = f"{tempdir}/matshow3d_patch_test.png" @@ -114,6 +114,7 @@ def test_3d_rgb(self): every_n=2, frame_dim=-1, channel_dim=0, + fill_value=0, show=False, ) diff --git a/tests/test_median_filter.py b/tests/test_median_filter.py index 1f5e623260..02fa812380 100644 --- a/tests/test_median_filter.py +++ b/tests/test_median_filter.py @@ -15,27 +15,21 @@ import numpy as np import torch +from parameterized import parameterized from monai.networks.layers import MedianFilter class MedianFilterTestCase(unittest.TestCase): - def test_3d_big(self): - a = torch.ones(1, 1, 2, 3, 5) - g = MedianFilter([1, 2, 4]).to(torch.device("cpu:0")) + @parameterized.expand([(torch.ones(1, 1, 2, 3, 5), [1, 2, 4]), (torch.ones(1, 1, 4, 3, 4), 1)]) # 3d_big # 3d + def test_3d(self, input_tensor, radius): + filter = MedianFilter(radius).to(torch.device("cpu:0")) - expected = a.numpy() - out = g(a).cpu().numpy() - np.testing.assert_allclose(out, expected, rtol=1e-5) - - def test_3d(self): - a = torch.ones(1, 1, 4, 3, 4) - g = MedianFilter(1).to(torch.device("cpu:0")) + expected = input_tensor.numpy() + output = filter(input_tensor).cpu().numpy() - expected = a.numpy() - out = g(a).cpu().numpy() - np.testing.assert_allclose(out, expected, rtol=1e-5) + np.testing.assert_allclose(output, expected, rtol=1e-5) def test_3d_radii(self): a = torch.ones(1, 1, 4, 3, 2) diff --git a/tests/test_mednistdataset.py b/tests/test_mednistdataset.py index baf3bf4f2d..c1b21e9373 100644 --- a/tests/test_mednistdataset.py +++ b/tests/test_mednistdataset.py @@ -41,7 +41,7 @@ def _test_dataset(dataset): self.assertEqual(len(dataset), int(MEDNIST_FULL_DATASET_LENGTH * dataset.test_frac)) self.assertTrue("image" in dataset[0]) 
self.assertTrue("label" in dataset[0]) - self.assertTrue(isinstance(dataset[0]["image"], MetaTensor)) + self.assertIsInstance(dataset[0]["image"], MetaTensor) self.assertTupleEqual(dataset[0]["image"].shape, (1, 64, 64)) with skip_if_downloading_fails(): @@ -65,11 +65,8 @@ def _test_dataset(dataset): self.assertEqual(data[0]["class_name"], "AbdomenCT") self.assertEqual(data[0]["label"], 0) shutil.rmtree(os.path.join(testing_dir, "MedNIST")) - try: + with self.assertRaisesRegex(RuntimeError, "^Cannot find dataset directory"): MedNISTDataset(root_dir=testing_dir, transform=transform, section="test", download=False) - except RuntimeError as e: - print(str(e)) - self.assertTrue(str(e).startswith("Cannot find dataset directory")) if __name__ == "__main__": diff --git a/tests/test_meta_affine.py b/tests/test_meta_affine.py index 95764a0c89..890734391f 100644 --- a/tests/test_meta_affine.py +++ b/tests/test_meta_affine.py @@ -160,7 +160,7 @@ def test_linear_consistent(self, xform_cls, input_dict, atol): diff = np.abs(itk.GetArrayFromImage(ref_2) - itk.GetArrayFromImage(expected)) avg_diff = np.mean(diff) - self.assertTrue(avg_diff < atol, f"{xform_cls} avg_diff: {avg_diff}, tol: {atol}") + self.assertLess(avg_diff, atol, f"{xform_cls} avg_diff: {avg_diff}, tol: {atol}") @parameterized.expand(TEST_CASES_DICT) def test_linear_consistent_dict(self, xform_cls, input_dict, atol): @@ -175,7 +175,7 @@ def test_linear_consistent_dict(self, xform_cls, input_dict, atol): diff = {k: np.abs(itk.GetArrayFromImage(ref_2[k]) - itk.GetArrayFromImage(expected[k])) for k in keys} avg_diff = {k: np.mean(diff[k]) for k in keys} for k in keys: - self.assertTrue(avg_diff[k] < atol, f"{xform_cls} avg_diff: {avg_diff}, tol: {atol}") + self.assertLess(avg_diff[k], atol, f"{xform_cls} avg_diff: {avg_diff}, tol: {atol}") if __name__ == "__main__": diff --git a/tests/test_meta_tensor.py b/tests/test_meta_tensor.py index 1e0f188b63..f31a07eba4 100644 --- a/tests/test_meta_tensor.py +++ 
b/tests/test_meta_tensor.py @@ -222,9 +222,9 @@ def test_stack(self, device, dtype): def test_get_set_meta_fns(self): set_track_meta(False) - self.assertEqual(get_track_meta(), False) + self.assertFalse(get_track_meta()) set_track_meta(True) - self.assertEqual(get_track_meta(), True) + self.assertTrue(get_track_meta()) @parameterized.expand(TEST_DEVICES) def test_torchscript(self, device): diff --git a/tests/test_mlp.py b/tests/test_mlp.py index 54f70d3318..2598d8877d 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -15,10 +15,12 @@ import numpy as np import torch +import torch.nn as nn from parameterized import parameterized from monai.networks import eval_mode from monai.networks.blocks.mlp import MLPBlock +from monai.networks.layers.factories import split_args TEST_CASE_MLP = [] for dropout_rate in np.linspace(0, 1, 4): @@ -31,6 +33,14 @@ ] TEST_CASE_MLP.append(test_case) +# test different activation layers +TEST_CASE_ACT = [] +for act in ["GELU", "GEGLU", ("GEGLU", {})]: # type: ignore + TEST_CASE_ACT.append([{"hidden_size": 128, "mlp_dim": 0, "act": act}, (2, 512, 128), (2, 512, 128)]) + +# test different dropout modes +TEST_CASE_DROP = [["vit", nn.Dropout], ["swin", nn.Dropout], ["vista3d", nn.Identity]] + class TestMLPBlock(unittest.TestCase): @@ -45,6 +55,24 @@ def test_ill_arg(self): with self.assertRaises(ValueError): MLPBlock(hidden_size=128, mlp_dim=512, dropout_rate=5.0) + @parameterized.expand(TEST_CASE_ACT) + def test_act(self, input_param, input_shape, expected_shape): + net = MLPBlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + act_name, _ = split_args(input_param["act"]) + if act_name == "GEGLU": + self.assertEqual(net.linear1.in_features, net.linear1.out_features // 2) + else: + self.assertEqual(net.linear1.in_features, net.linear1.out_features) + + @parameterized.expand(TEST_CASE_DROP) + def test_dropout_mode(self, dropout_mode, dropout_layer): + net = 
MLPBlock(hidden_size=128, mlp_dim=512, dropout_rate=0.1, dropout_mode=dropout_mode) + self.assertTrue(isinstance(net.drop1, dropout_layer)) + self.assertTrue(isinstance(net.drop2, dropout_layer)) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_mmar_download.py b/tests/test_mmar_download.py index 6af3d09fb2..2ac73a8149 100644 --- a/tests/test_mmar_download.py +++ b/tests/test_mmar_download.py @@ -142,7 +142,7 @@ def test_load_ckpt(self, input_args, expected_name, expected_val): def test_unique(self): # model ids are unique keys = sorted(m["id"] for m in MODEL_DESC) - self.assertTrue(keys == sorted(set(keys))) + self.assertEqual(keys, sorted(set(keys))) def test_search(self): self.assertEqual(_get_val({"a": 1, "b": 2}, key="b"), 2) diff --git a/tests/test_monai_utils_misc.py b/tests/test_monai_utils_misc.py index a2a4ed62f7..f4eb5d3956 100644 --- a/tests/test_monai_utils_misc.py +++ b/tests/test_monai_utils_misc.py @@ -92,12 +92,11 @@ def test_run_cmd(self): cmd2 = "-c" cmd3 = 'import sys; print("\\tThis is on stderr\\n", file=sys.stderr); sys.exit(1)' os.environ["MONAI_DEBUG"] = str(True) - try: + with self.assertRaises(RuntimeError) as cm: run_cmd([cmd1, cmd2, cmd3], check=True) - except RuntimeError as err: - self.assertIn("This is on stderr", str(err)) - self.assertNotIn("\\n", str(err)) - self.assertNotIn("\\t", str(err)) + self.assertIn("This is on stderr", str(cm.exception)) + self.assertNotIn("\\n", str(cm.exception)) + self.assertNotIn("\\t", str(cm.exception)) if __name__ == "__main__": diff --git a/tests/test_morphological_ops.py b/tests/test_morphological_ops.py new file mode 100644 index 0000000000..422e8c4b9d --- /dev/null +++ b/tests/test_morphological_ops.py @@ -0,0 +1,102 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.transforms.utils_morphological_ops import dilate, erode, get_morphological_filter_result_t +from tests.utils import TEST_NDARRAYS, assert_allclose + +TESTS_SHAPE = [] +for p in TEST_NDARRAYS: + mask = torch.zeros(1, 1, 5, 5, 5) + filter_size = 3 + TESTS_SHAPE.append([{"mask": p(mask), "filter_size": filter_size}, [1, 1, 5, 5, 5]]) + mask = torch.zeros(3, 2, 5, 5, 5) + filter_size = 5 + TESTS_SHAPE.append([{"mask": p(mask), "filter_size": filter_size}, [3, 2, 5, 5, 5]]) + mask = torch.zeros(1, 1, 1, 1, 1) + filter_size = 5 + TESTS_SHAPE.append([{"mask": p(mask), "filter_size": filter_size}, [1, 1, 1, 1, 1]]) + mask = torch.zeros(1, 1, 1, 1) + filter_size = 5 + TESTS_SHAPE.append([{"mask": p(mask), "filter_size": filter_size}, [1, 1, 1, 1]]) + +TESTS_VALUE_T = [] +filter_size = 3 +mask = torch.ones(3, 2, 3, 3, 3) +TESTS_VALUE_T.append([{"mask": mask, "filter_size": filter_size, "pad_value": 1.0}, torch.ones(3, 2, 3, 3, 3)]) +mask = torch.zeros(3, 2, 3, 3, 3) +TESTS_VALUE_T.append([{"mask": mask, "filter_size": filter_size, "pad_value": 0.0}, torch.zeros(3, 2, 3, 3, 3)]) +mask = torch.ones(3, 2, 3, 3) +TESTS_VALUE_T.append([{"mask": mask, "filter_size": filter_size, "pad_value": 1.0}, torch.ones(3, 2, 3, 3)]) +mask = torch.zeros(3, 2, 3, 3) +TESTS_VALUE_T.append([{"mask": mask, "filter_size": filter_size, "pad_value": 0.0}, torch.zeros(3, 2, 3, 3)]) + +TESTS_VALUE = [] +for p in TEST_NDARRAYS: + mask = torch.zeros(3, 2, 5, 5, 5) + 
filter_size = 3 + TESTS_VALUE.append( + [{"mask": p(mask), "filter_size": filter_size}, p(torch.zeros(3, 2, 5, 5, 5)), p(torch.zeros(3, 2, 5, 5, 5))] + ) + mask = torch.ones(1, 1, 3, 3, 3) + filter_size = 3 + TESTS_VALUE.append( + [{"mask": p(mask), "filter_size": filter_size}, p(torch.ones(1, 1, 3, 3, 3)), p(torch.ones(1, 1, 3, 3, 3))] + ) + mask = torch.ones(1, 2, 3, 3, 3) + filter_size = 3 + TESTS_VALUE.append( + [{"mask": p(mask), "filter_size": filter_size}, p(torch.ones(1, 2, 3, 3, 3)), p(torch.ones(1, 2, 3, 3, 3))] + ) + mask = torch.zeros(3, 2, 3, 3, 3) + mask[:, :, 1, 1, 1] = 1.0 + filter_size = 3 + TESTS_VALUE.append( + [{"mask": p(mask), "filter_size": filter_size}, p(torch.zeros(3, 2, 3, 3, 3)), p(torch.ones(3, 2, 3, 3, 3))] + ) + mask = torch.zeros(3, 2, 3, 3) + mask[:, :, 1, 1] = 1.0 + filter_size = 3 + TESTS_VALUE.append( + [{"mask": p(mask), "filter_size": filter_size}, p(torch.zeros(3, 2, 3, 3)), p(torch.ones(3, 2, 3, 3))] + ) + + +class TestMorph(unittest.TestCase): + + @parameterized.expand(TESTS_SHAPE) + def test_shape(self, input_data, expected_result): + result1 = erode(input_data["mask"], input_data["filter_size"]) + assert_allclose(result1.shape, expected_result, type_test=False, device_test=False, atol=0.0) + + @parameterized.expand(TESTS_VALUE_T) + def test_value_t(self, input_data, expected_result): + result1 = get_morphological_filter_result_t( + input_data["mask"], input_data["filter_size"], input_data["pad_value"] + ) + assert_allclose(result1, expected_result, type_test=False, device_test=False, atol=0.0) + + @parameterized.expand(TESTS_VALUE) + def test_value(self, input_data, expected_erode_result, expected_dilate_result): + result1 = erode(input_data["mask"], input_data["filter_size"]) + assert_allclose(result1, expected_erode_result, type_test=True, device_test=True, atol=0.0) + result2 = dilate(input_data["mask"], input_data["filter_size"]) + assert_allclose(result2, expected_dilate_result, type_test=True, device_test=True, 
atol=0.0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_multi_scale.py b/tests/test_multi_scale.py index 6681f266a8..0b49087216 100644 --- a/tests/test_multi_scale.py +++ b/tests/test_multi_scale.py @@ -58,17 +58,24 @@ def test_shape(self, input_param, input_data, expected_val): result = MultiScaleLoss(**input_param).forward(**input_data) np.testing.assert_allclose(result.detach().cpu().numpy(), expected_val, rtol=1e-5) - def test_ill_opts(self): - with self.assertRaisesRegex(ValueError, ""): - MultiScaleLoss(loss=dice_loss, kernel="none") - with self.assertRaisesRegex(ValueError, ""): - MultiScaleLoss(loss=dice_loss, scales=[-1])( - torch.ones((1, 1, 3), device=device), torch.ones((1, 1, 3), device=device) - ) - with self.assertRaisesRegex(ValueError, ""): - MultiScaleLoss(loss=dice_loss, scales=[-1], reduction="none")( - torch.ones((1, 1, 3), device=device), torch.ones((1, 1, 3), device=device) - ) + @parameterized.expand( + [ + ({"loss": dice_loss, "kernel": "none"}, None, None), # kernel_none + ({"loss": dice_loss, "scales": [-1]}, torch.ones((1, 1, 3)), torch.ones((1, 1, 3))), # scales_negative + ( + {"loss": dice_loss, "scales": [-1], "reduction": "none"}, + torch.ones((1, 1, 3)), + torch.ones((1, 1, 3)), + ), # scales_negative_reduction_none + ] + ) + def test_ill_opts(self, kwargs, input, target): + if input is None and target is None: + with self.assertRaisesRegex(ValueError, ""): + MultiScaleLoss(**kwargs) + else: + with self.assertRaisesRegex(ValueError, ""): + MultiScaleLoss(**kwargs)(input, target) def test_script(self): input_param, input_data, expected_val = TEST_CASES[0] diff --git a/tests/test_nacl_loss.py b/tests/test_nacl_loss.py new file mode 100644 index 0000000000..704bbdb9b1 --- /dev/null +++ b/tests/test_nacl_loss.py @@ -0,0 +1,167 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import numpy as np +import torch +from parameterized import parameterized + +from monai.losses import NACLLoss + +inputs = torch.tensor( + [ + [ + [ + [0.1498, 0.1158, 0.3996, 0.3730], + [0.2155, 0.1585, 0.8541, 0.8579], + [0.6640, 0.2424, 0.0774, 0.0324], + [0.0580, 0.2180, 0.3447, 0.8722], + ], + [ + [0.3908, 0.9366, 0.1779, 0.1003], + [0.9630, 0.6118, 0.4405, 0.7916], + [0.5782, 0.9515, 0.4088, 0.3946], + [0.7860, 0.3910, 0.0324, 0.9568], + ], + [ + [0.0759, 0.0238, 0.5570, 0.1691], + [0.2703, 0.7722, 0.1611, 0.6431], + [0.8051, 0.6596, 0.4121, 0.1125], + [0.5283, 0.6746, 0.5528, 0.7913], + ], + ] + ] +) +targets = torch.tensor([[[1, 1, 1, 1], [1, 1, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0]]]) + +TEST_CASES = [ + [{"classes": 3, "dim": 2}, {"inputs": inputs, "targets": targets}, 1.1442], + [{"classes": 3, "dim": 2}, {"inputs": inputs.repeat(4, 1, 1, 1), "targets": targets.repeat(4, 1, 1)}, 1.1442], + [{"classes": 3, "dim": 2, "kernel_ops": "gaussian"}, {"inputs": inputs, "targets": targets}, 1.1433], + [{"classes": 3, "dim": 2, "kernel_ops": "gaussian", "sigma": 0.5}, {"inputs": inputs, "targets": targets}, 1.1469], + [{"classes": 3, "dim": 2, "distance_type": "l2"}, {"inputs": inputs, "targets": targets}, 1.1269], + [{"classes": 3, "dim": 2, "alpha": 0.2}, {"inputs": inputs, "targets": targets}, 1.1790], + [ + {"classes": 3, "dim": 3, "kernel_ops": "gaussian"}, + { + "inputs": torch.tensor( + [ + [ + [ + [ + [0.5977, 0.2767, 0.0591, 0.1675], + [0.4835, 0.3778, 0.8406, 0.3065], + [0.6047, 0.2860, 0.9742, 
0.2013], + [0.9128, 0.8368, 0.6711, 0.4384], + ], + [ + [0.9797, 0.1863, 0.5584, 0.6652], + [0.2272, 0.2004, 0.7914, 0.4224], + [0.5097, 0.8818, 0.2581, 0.3495], + [0.1054, 0.5483, 0.3732, 0.3587], + ], + [ + [0.3060, 0.7066, 0.7922, 0.4689], + [0.1733, 0.8902, 0.6704, 0.2037], + [0.8656, 0.5561, 0.2701, 0.0092], + [0.1866, 0.7714, 0.6424, 0.9791], + ], + [ + [0.5067, 0.3829, 0.6156, 0.8985], + [0.5192, 0.8347, 0.2098, 0.2260], + [0.8887, 0.3944, 0.6400, 0.5345], + [0.1207, 0.3763, 0.5282, 0.7741], + ], + ], + [ + [ + [0.8499, 0.4759, 0.1964, 0.5701], + [0.3190, 0.1238, 0.2368, 0.9517], + [0.0797, 0.6185, 0.0135, 0.8672], + [0.4116, 0.1683, 0.1355, 0.0545], + ], + [ + [0.7533, 0.2658, 0.5955, 0.4498], + [0.9500, 0.2317, 0.2825, 0.9763], + [0.1493, 0.1558, 0.3743, 0.8723], + [0.1723, 0.7980, 0.8816, 0.0133], + ], + [ + [0.8426, 0.2666, 0.2077, 0.3161], + [0.1725, 0.8414, 0.1515, 0.2825], + [0.4882, 0.5159, 0.4120, 0.1585], + [0.2551, 0.9073, 0.7691, 0.9898], + ], + [ + [0.4633, 0.8717, 0.8537, 0.2899], + [0.3693, 0.7953, 0.1183, 0.4596], + [0.0087, 0.7925, 0.0989, 0.8385], + [0.8261, 0.6920, 0.7069, 0.4464], + ], + ], + [ + [ + [0.0110, 0.1608, 0.4814, 0.6317], + [0.0194, 0.9669, 0.3259, 0.0028], + [0.5674, 0.8286, 0.0306, 0.5309], + [0.3973, 0.8183, 0.0238, 0.1934], + ], + [ + [0.8947, 0.6629, 0.9439, 0.8905], + [0.0072, 0.1697, 0.4634, 0.0201], + [0.7184, 0.2424, 0.0820, 0.7504], + [0.3937, 0.1424, 0.4463, 0.5779], + ], + [ + [0.4123, 0.6227, 0.0523, 0.8826], + [0.0051, 0.0353, 0.3662, 0.7697], + [0.4867, 0.8986, 0.2510, 0.5316], + [0.1856, 0.2634, 0.9140, 0.9725], + ], + [ + [0.2041, 0.4248, 0.2371, 0.7256], + [0.2168, 0.5380, 0.4538, 0.7007], + [0.9013, 0.2623, 0.0739, 0.2998], + [0.1366, 0.5590, 0.2952, 0.4592], + ], + ], + ] + ] + ), + "targets": torch.tensor( + [ + [ + [[0, 1, 0, 1], [1, 2, 1, 0], [2, 1, 1, 1], [1, 1, 0, 1]], + [[2, 1, 0, 2], [1, 2, 0, 2], [1, 0, 1, 1], [1, 1, 0, 0]], + [[1, 0, 2, 1], [0, 2, 2, 1], [1, 0, 1, 1], [0, 0, 2, 1]], + [[2, 1, 1, 
0], [1, 0, 0, 2], [1, 0, 2, 1], [2, 1, 0, 1]], + ] + ] + ), + }, + 1.15035, + ], +] + + +class TestNACLLoss(unittest.TestCase): + @parameterized.expand(TEST_CASES) + def test_result(self, input_param, input_data, expected_val): + loss = NACLLoss(**input_param) + result = loss(**input_data) + np.testing.assert_allclose(result.detach().cpu().numpy(), expected_val, atol=1e-4, rtol=1e-4) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_optional_import.py b/tests/test_optional_import.py index e7e1c03fd0..2f640f88d0 100644 --- a/tests/test_optional_import.py +++ b/tests/test_optional_import.py @@ -13,22 +13,20 @@ import unittest +from parameterized import parameterized + from monai.utils import OptionalImportError, exact_version, optional_import class TestOptionalImport(unittest.TestCase): - def test_default(self): - my_module, flag = optional_import("not_a_module") + @parameterized.expand(["not_a_module", "torch.randint"]) + def test_default(self, import_module): + my_module, flag = optional_import(import_module) self.assertFalse(flag) with self.assertRaises(OptionalImportError): my_module.test - my_module, flag = optional_import("torch.randint") - with self.assertRaises(OptionalImportError): - self.assertFalse(flag) - print(my_module.test) - def test_import_valid(self): my_module, flag = optional_import("torch") self.assertTrue(flag) @@ -47,18 +45,9 @@ def test_import_wrong_number(self): self.assertTrue(flag) print(my_module.randint(1, 2, (1, 2))) - def test_import_good_number(self): - my_module, flag = optional_import("torch", "0") - my_module.nn - self.assertTrue(flag) - print(my_module.randint(1, 2, (1, 2))) - - my_module, flag = optional_import("torch", "0.0.0.1") - my_module.nn - self.assertTrue(flag) - print(my_module.randint(1, 2, (1, 2))) - - my_module, flag = optional_import("torch", "1.1.0") + @parameterized.expand(["0", "0.0.0.1", "1.1.0"]) + def test_import_good_number(self, version_number): + my_module, flag = 
optional_import("torch", version_number) my_module.nn self.assertTrue(flag) print(my_module.randint(1, 2, (1, 2))) diff --git a/tests/test_ordering.py b/tests/test_ordering.py new file mode 100644 index 0000000000..e6b235e179 --- /dev/null +++ b/tests/test_ordering.py @@ -0,0 +1,289 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import numpy as np +from parameterized import parameterized + +from monai.utils.enums import OrderingTransformations, OrderingType +from monai.utils.ordering import Ordering + +TEST_2D_NON_RANDOM = [ + [ + { + "ordering_type": OrderingType.RASTER_SCAN, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (), + "transpositions_axes": (), + "rot90_axes": (), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [0, 1, 2, 3], + ], + [ + { + "ordering_type": OrderingType.S_CURVE, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (), + "transpositions_axes": (), + "rot90_axes": (), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [0, 1, 3, 2], + ], + [ + { + "ordering_type": OrderingType.RASTER_SCAN, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (True, 
False), + "transpositions_axes": (), + "rot90_axes": (), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [2, 3, 0, 1], + ], + [ + { + "ordering_type": OrderingType.S_CURVE, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (True, False), + "transpositions_axes": (), + "rot90_axes": (), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [2, 3, 1, 0], + ], + [ + { + "ordering_type": OrderingType.RASTER_SCAN, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (), + "transpositions_axes": ((1, 0),), + "rot90_axes": (), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [0, 2, 1, 3], + ], + [ + { + "ordering_type": OrderingType.S_CURVE, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (), + "transpositions_axes": ((1, 0),), + "rot90_axes": (), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [0, 2, 3, 1], + ], + [ + { + "ordering_type": OrderingType.RASTER_SCAN, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (), + "transpositions_axes": (), + "rot90_axes": ((0, 1),), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [1, 3, 0, 2], + ], + [ + { + "ordering_type": OrderingType.S_CURVE, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (), + "transpositions_axes": (), + "rot90_axes": ((0, 1),), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + 
OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [1, 3, 2, 0], + ], + [ + { + "ordering_type": OrderingType.RASTER_SCAN, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (True, False), + "transpositions_axes": ((1, 0),), + "rot90_axes": ((0, 1),), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [0, 1, 2, 3], + ], + [ + { + "ordering_type": OrderingType.S_CURVE, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (True, False), + "transpositions_axes": ((1, 0),), + "rot90_axes": ((0, 1),), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [0, 1, 3, 2], + ], +] + + +TEST_3D = [ + [ + { + "ordering_type": OrderingType.RASTER_SCAN, + "spatial_dims": 3, + "dimensions": (1, 2, 2, 2), + "reflected_spatial_dims": (), + "transpositions_axes": (), + "rot90_axes": (), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + }, + [0, 1, 2, 3, 4, 5, 6, 7], + ] +] + +TEST_ORDERING_TYPE_FAILURE = [ + [ + { + "ordering_type": "hilbert", + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (True, False), + "transpositions_axes": ((1, 0),), + "rot90_axes": ((0, 1),), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + } + ] +] + +TEST_ORDERING_TRANSFORMATION_FAILURE = [ + [ + { + "ordering_type": OrderingType.S_CURVE, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (True, False), + "transpositions_axes": ((1, 0),), + "rot90_axes": ((0, 1),), + "transformation_order": ( + 
OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + "flip", + ), + } + ] +] + +TEST_REVERT = [ + [ + { + "ordering_type": OrderingType.S_CURVE, + "spatial_dims": 2, + "dimensions": (1, 2, 2), + "reflected_spatial_dims": (True, False), + "transpositions_axes": (), + "rot90_axes": (), + "transformation_order": ( + OrderingTransformations.TRANSPOSE.value, + OrderingTransformations.ROTATE_90.value, + OrderingTransformations.REFLECT.value, + ), + } + ] +] + + +class TestOrdering(unittest.TestCase): + @parameterized.expand(TEST_2D_NON_RANDOM + TEST_3D) + def test_ordering(self, input_param, expected_sequence_ordering): + ordering = Ordering(**input_param) + self.assertTrue(np.array_equal(ordering.get_sequence_ordering(), expected_sequence_ordering, equal_nan=True)) + + @parameterized.expand(TEST_ORDERING_TYPE_FAILURE) + def test_ordering_type_failure(self, input_param): + with self.assertRaises(ValueError): + Ordering(**input_param) + + @parameterized.expand(TEST_ORDERING_TRANSFORMATION_FAILURE) + def test_ordering_transformation_failure(self, input_param): + with self.assertRaises(ValueError): + Ordering(**input_param) + + @parameterized.expand(TEST_REVERT) + def test_revert(self, input_param): + sequence = np.random.randint(0, 100, size=input_param["dimensions"]).flatten() + + ordering = Ordering(**input_param) + + reverted_sequence = sequence[ordering.get_sequence_ordering()] + reverted_sequence = reverted_sequence[ordering.get_revert_sequence_ordering()] + + self.assertTrue(np.array_equal(sequence, reverted_sequence, equal_nan=True)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_pad_collation.py b/tests/test_pad_collation.py index ee6e001438..9d5012c9a3 100644 --- a/tests/test_pad_collation.py +++ b/tests/test_pad_collation.py @@ -89,7 +89,7 @@ def tearDown(self) -> None: @parameterized.expand(TESTS) def test_pad_collation(self, t_type, collate_method, transform): - if t_type == dict: + if t_type is dict: 
dataset = CacheDataset(self.dict_data, transform, progress=False) else: dataset = _Dataset(self.list_data, self.list_labels, transform) @@ -104,7 +104,7 @@ def test_pad_collation(self, t_type, collate_method, transform): loader = DataLoader(dataset, batch_size=10, collate_fn=collate_method) # check collation in forward direction for data in loader: - if t_type == dict: + if t_type is dict: shapes = [] decollated_data = decollate_batch(data) for d in decollated_data: @@ -113,11 +113,11 @@ def test_pad_collation(self, t_type, collate_method, transform): self.assertTrue(len(output["image"].applied_operations), len(dataset.transform.transforms)) self.assertTrue(len(set(shapes)) > 1) # inverted shapes must be different because of random xforms - if t_type == dict: + if t_type is dict: batch_inverse = BatchInverseTransform(dataset.transform, loader) for data in loader: output = batch_inverse(data) - self.assertTrue(output[0]["image"].shape, (1, 10, 9)) + self.assertEqual(output[0]["image"].shape, (1, 10, 9)) if __name__ == "__main__": diff --git a/tests/test_patch_gan_dicriminator.py b/tests/test_patch_gan_dicriminator.py new file mode 100644 index 0000000000..c19898e70d --- /dev/null +++ b/tests/test_patch_gan_dicriminator.py @@ -0,0 +1,179 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets import MultiScalePatchDiscriminator, PatchDiscriminator +from tests.utils import test_script_save + +TEST_PATCHGAN = [ + [ + { + "num_layers_d": 3, + "spatial_dims": 2, + "channels": 8, + "in_channels": 3, + "out_channels": 1, + "kernel_size": 3, + "activation": "LEAKYRELU", + "norm": "instance", + "bias": False, + "dropout": 0.1, + }, + torch.rand([1, 3, 256, 512]), + (1, 8, 128, 256), + (1, 1, 32, 64), + ], + [ + { + "num_layers_d": 3, + "spatial_dims": 3, + "channels": 8, + "in_channels": 3, + "out_channels": 1, + "kernel_size": 3, + "activation": "LEAKYRELU", + "norm": "instance", + "bias": False, + "dropout": 0.1, + }, + torch.rand([1, 3, 256, 512, 256]), + (1, 8, 128, 256, 128), + (1, 1, 32, 64, 32), + ], +] + +TEST_MULTISCALE_PATCHGAN = [ + [ + { + "num_d": 2, + "num_layers_d": 3, + "spatial_dims": 2, + "channels": 8, + "in_channels": 3, + "out_channels": 1, + "kernel_size": 3, + "activation": "LEAKYRELU", + "norm": "instance", + "bias": False, + "dropout": 0.1, + "minimum_size_im": 256, + }, + torch.rand([1, 3, 256, 512]), + [(1, 1, 32, 64), (1, 1, 4, 8)], + [4, 7], + ], + [ + { + "num_d": 2, + "num_layers_d": 3, + "spatial_dims": 3, + "channels": 8, + "in_channels": 3, + "out_channels": 1, + "kernel_size": 3, + "activation": "LEAKYRELU", + "norm": "instance", + "bias": False, + "dropout": 0.1, + "minimum_size_im": 256, + }, + torch.rand([1, 3, 256, 512, 256]), + [(1, 1, 32, 64, 32), (1, 1, 4, 8, 4)], + [4, 7], + ], +] +TEST_TOO_SMALL_SIZE = [ + { + "num_d": 2, + "num_layers_d": 6, + "spatial_dims": 2, + "channels": 8, + "in_channels": 3, + "out_channels": 1, + "kernel_size": 3, + "activation": "LEAKYRELU", + "norm": "instance", + "bias": False, + "dropout": 0.1, + "minimum_size_im": 256, + } +] + + +class TestPatchGAN(unittest.TestCase): + @parameterized.expand(TEST_PATCHGAN) + 
def test_shape(self, input_param, input_data, expected_shape_feature, expected_shape_output): + net = PatchDiscriminator(**input_param) + with eval_mode(net): + result = net.forward(input_data) + self.assertEqual(tuple(result[0].shape), expected_shape_feature) + self.assertEqual(tuple(result[-1].shape), expected_shape_output) + + def test_script(self): + net = PatchDiscriminator( + num_layers_d=3, + spatial_dims=2, + channels=8, + in_channels=3, + out_channels=1, + kernel_size=3, + activation="LEAKYRELU", + norm="instance", + bias=False, + dropout=0.1, + ) + i = torch.rand([1, 3, 256, 512]) + test_script_save(net, i) + + +class TestMultiscalePatchGAN(unittest.TestCase): + @parameterized.expand(TEST_MULTISCALE_PATCHGAN) + def test_shape(self, input_param, input_data, expected_shape, features_lengths=None): + net = MultiScalePatchDiscriminator(**input_param) + with eval_mode(net): + result, features = net.forward(input_data) + for r_ind, r in enumerate(result): + self.assertEqual(tuple(r.shape), expected_shape[r_ind]) + for o_d_ind, o_d in enumerate(features): + self.assertEqual(len(o_d), features_lengths[o_d_ind]) + + def test_too_small_shape(self): + with self.assertRaises(AssertionError): + MultiScalePatchDiscriminator(**TEST_TOO_SMALL_SIZE[0]) + + def test_script(self): + net = MultiScalePatchDiscriminator( + num_d=2, + num_layers_d=3, + spatial_dims=2, + channels=8, + in_channels=3, + out_channels=1, + kernel_size=3, + activation="LEAKYRELU", + norm="instance", + bias=False, + dropout=0.1, + minimum_size_im=256, + ) + i = torch.rand([1, 3, 256, 512]) + test_script_save(net, i) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_patchembedding.py b/tests/test_patchembedding.py index f8610d9214..71ac767966 100644 --- a/tests/test_patchembedding.py +++ b/tests/test_patchembedding.py @@ -43,7 +43,7 @@ "patch_size": (patch_size,) * nd, "hidden_size": hidden_size, "num_heads": num_heads, - "pos_embed": proj_type, + "proj_type": proj_type, 
"pos_embed_type": pos_embed_type, "dropout_rate": dropout_rate, }, @@ -93,6 +93,32 @@ def test_shape(self, input_param, input_shape, expected_shape): result = net(torch.randn(input_shape)) self.assertEqual(result.shape, expected_shape) + def test_sincos_pos_embed(self): + net = PatchEmbeddingBlock( + in_channels=1, + img_size=(32, 32, 32), + patch_size=(8, 8, 8), + hidden_size=96, + num_heads=8, + pos_embed_type="sincos", + dropout_rate=0.5, + ) + + self.assertEqual(net.position_embeddings.requires_grad, False) + + def test_learnable_pos_embed(self): + net = PatchEmbeddingBlock( + in_channels=1, + img_size=(32, 32, 32), + patch_size=(8, 8, 8), + hidden_size=96, + num_heads=8, + pos_embed_type="learnable", + dropout_rate=0.5, + ) + + self.assertEqual(net.position_embeddings.requires_grad, True) + def test_ill_arg(self): with self.assertRaises(ValueError): PatchEmbeddingBlock( @@ -101,7 +127,7 @@ def test_ill_arg(self): patch_size=(16, 16, 16), hidden_size=128, num_heads=12, - pos_embed="conv", + proj_type="conv", pos_embed_type="sincos", dropout_rate=5.0, ) @@ -113,7 +139,7 @@ def test_ill_arg(self): patch_size=(64, 64, 64), hidden_size=512, num_heads=8, - pos_embed="perceptron", + proj_type="perceptron", pos_embed_type="sincos", dropout_rate=0.3, ) @@ -125,7 +151,7 @@ def test_ill_arg(self): patch_size=(8, 8, 8), hidden_size=512, num_heads=14, - pos_embed="conv", + proj_type="conv", dropout_rate=0.3, ) @@ -136,7 +162,7 @@ def test_ill_arg(self): patch_size=(4, 4, 4), hidden_size=768, num_heads=8, - pos_embed="perceptron", + proj_type="perceptron", dropout_rate=0.3, ) with self.assertRaises(ValueError): @@ -157,7 +183,7 @@ def test_ill_arg(self): patch_size=(16, 16, 16), hidden_size=768, num_heads=12, - pos_embed="perc", + proj_type="perc", dropout_rate=0.3, ) diff --git a/tests/test_perceptual_loss.py b/tests/test_perceptual_loss.py index ba204af697..b8aa2e5982 100644 --- a/tests/test_perceptual_loss.py +++ b/tests/test_perceptual_loss.py @@ -18,7 +18,7 @@ from 
monai.losses import PerceptualLoss from monai.utils import optional_import -from tests.utils import SkipIfBeforePyTorchVersion, skip_if_downloading_fails, skip_if_quick +from tests.utils import SkipIfBeforePyTorchVersion, assert_allclose, skip_if_downloading_fails, skip_if_quick _, has_torchvision = optional_import("torchvision") TEST_CASES = [ @@ -40,6 +40,31 @@ (2, 1, 64, 64, 64), (2, 1, 64, 64, 64), ], + [ + {"spatial_dims": 3, "network_type": "medicalnet_resnet10_23datasets", "is_fake_3d": False}, + (2, 6, 64, 64, 64), + (2, 6, 64, 64, 64), + ], + [ + { + "spatial_dims": 3, + "network_type": "medicalnet_resnet10_23datasets", + "is_fake_3d": False, + "channel_wise": True, + }, + (2, 6, 64, 64, 64), + (2, 6, 64, 64, 64), + ], + [ + {"spatial_dims": 3, "network_type": "medicalnet_resnet50_23datasets", "is_fake_3d": False}, + (2, 1, 64, 64, 64), + (2, 1, 64, 64, 64), + ], + [ + {"spatial_dims": 3, "network_type": "medicalnet_resnet50_23datasets", "is_fake_3d": False}, + (2, 6, 64, 64, 64), + (2, 6, 64, 64, 64), + ], [ {"spatial_dims": 3, "network_type": "resnet50", "is_fake_3d": True, "pretrained": True, "fake_3d_ratio": 0.2}, (2, 1, 64, 64, 64), @@ -58,7 +83,11 @@ def test_shape(self, input_param, input_shape, target_shape): with skip_if_downloading_fails(): loss = PerceptualLoss(**input_param) result = loss(torch.randn(input_shape), torch.randn(target_shape)) - self.assertEqual(result.shape, torch.Size([])) + + if "channel_wise" in input_param.keys() and input_param["channel_wise"]: + self.assertEqual(result.shape, torch.Size([input_shape[1]])) + else: + self.assertEqual(result.shape, torch.Size([])) @parameterized.expand(TEST_CASES) def test_identical_input(self, input_param, input_shape, target_shape): @@ -66,7 +95,11 @@ def test_identical_input(self, input_param, input_shape, target_shape): loss = PerceptualLoss(**input_param) tensor = torch.randn(input_shape) result = loss(tensor, tensor) - self.assertEqual(result, torch.Tensor([0.0])) + + if "channel_wise" 
in input_param.keys() and input_param["channel_wise"]: + assert_allclose(result, torch.Tensor([0.0] * input_shape[1])) + else: + self.assertEqual(result, torch.Tensor([0.0])) def test_different_shape(self): with skip_if_downloading_fails(): @@ -80,12 +113,10 @@ def test_1d(self): with self.assertRaises(NotImplementedError): PerceptualLoss(spatial_dims=1) - def test_medicalnet_on_2d_data(self): - with self.assertRaises(ValueError): - PerceptualLoss(spatial_dims=2, network_type="medicalnet_resnet10_23datasets") - + @parameterized.expand(["medicalnet_resnet10_23datasets", "medicalnet_resnet50_23datasets"]) + def test_medicalnet_on_2d_data(self, network_type): with self.assertRaises(ValueError): - PerceptualLoss(spatial_dims=2, network_type="medicalnet_resnet50_23datasets") + PerceptualLoss(spatial_dims=2, network_type=network_type) if __name__ == "__main__": diff --git a/tests/test_persistentdataset.py b/tests/test_persistentdataset.py index b7bf2fbb11..7c4969e283 100644 --- a/tests/test_persistentdataset.py +++ b/tests/test_persistentdataset.py @@ -165,7 +165,7 @@ def test_different_transforms(self): im1 = PersistentDataset([im], Identity(), cache_dir=path, hash_transform=json_hashing)[0] im2 = PersistentDataset([im], Flip(1), cache_dir=path, hash_transform=json_hashing)[0] l2 = ((im1 - im2) ** 2).sum() ** 0.5 - self.assertTrue(l2 > 1) + self.assertGreater(l2, 1) if __name__ == "__main__": diff --git a/tests/test_prepare_batch_default.py b/tests/test_prepare_batch_default.py index d5a5fbf57e..093468ce27 100644 --- a/tests/test_prepare_batch_default.py +++ b/tests/test_prepare_batch_default.py @@ -14,12 +14,14 @@ import unittest import torch +from parameterized import parameterized from monai.engines import PrepareBatchDefault, SupervisedEvaluator from tests.utils import assert_allclose class TestNet(torch.nn.Module): + __test__ = False # indicate to pytest that this class is not intended for collection def forward(self, x: torch.Tensor): return x @@ -27,85 +29,48 @@ 
def forward(self, x: torch.Tensor): class TestPrepareBatchDefault(unittest.TestCase): - def test_dict_content(self): - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - dataloader = [ - { - "image": torch.tensor([1, 2]), - "label": torch.tensor([3, 4]), - "extra1": torch.tensor([5, 6]), - "extra2": 16, - "extra3": "test", - } + @parameterized.expand( + [ + ( + [ + { + "image": torch.tensor([1, 2]), + "label": torch.tensor([3, 4]), + "extra1": torch.tensor([5, 6]), + "extra2": 16, + "extra3": "test", + } + ], + TestNet(), + True, + ), # dict_content + ([torch.tensor([1, 2])], torch.nn.Identity(), True), # tensor_content + ([(torch.tensor([1, 2]), torch.tensor([3, 4]))], torch.nn.Identity(), True), # pair_content + ([], TestNet(), False), # empty_data ] - # set up engine - evaluator = SupervisedEvaluator( - device=device, - val_data_loader=dataloader, - epoch_length=1, - network=TestNet(), - non_blocking=False, - prepare_batch=PrepareBatchDefault(), - decollate=False, - mode="eval", - ) - evaluator.run() - output = evaluator.state.output - assert_allclose(output["image"], torch.tensor([1, 2], device=device)) - assert_allclose(output["label"], torch.tensor([3, 4], device=device)) - - def test_tensor_content(self): + ) + def test_prepare_batch(self, dataloader, network, should_run): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - dataloader = [torch.tensor([1, 2])] - - # set up engine evaluator = SupervisedEvaluator( device=device, val_data_loader=dataloader, - epoch_length=1, - network=torch.nn.Identity(), + epoch_length=len(dataloader) if should_run else 0, + network=network, non_blocking=False, prepare_batch=PrepareBatchDefault(), decollate=False, - mode="eval", + mode="eval" if should_run else "train", ) evaluator.run() - output = evaluator.state.output - assert_allclose(output["image"], torch.tensor([1, 2], device=device)) - self.assertTrue(output["label"] is None) - def test_pair_content(self): - device = 
torch.device("cuda" if torch.cuda.is_available() else "cpu") - dataloader = [(torch.tensor([1, 2]), torch.tensor([3, 4]))] - - # set up engine - evaluator = SupervisedEvaluator( - device=device, - val_data_loader=dataloader, - epoch_length=1, - network=torch.nn.Identity(), - non_blocking=False, - prepare_batch=PrepareBatchDefault(), - decollate=False, - mode="eval", - ) - evaluator.run() - output = evaluator.state.output - assert_allclose(output["image"], torch.tensor([1, 2], device=device)) - assert_allclose(output["label"], torch.tensor([3, 4], device=device)) - - def test_empty_data(self): - dataloader = [] - evaluator = SupervisedEvaluator( - val_data_loader=dataloader, - device=torch.device("cpu"), - epoch_length=0, - network=TestNet(), - non_blocking=False, - prepare_batch=PrepareBatchDefault(), - decollate=False, - ) - evaluator.run() + if should_run: + output = evaluator.state.output + if isinstance(dataloader[0], dict) or isinstance(dataloader[0], tuple): + assert_allclose(output["image"], torch.tensor([1, 2], device=device)) + assert_allclose(output["label"], torch.tensor([3, 4], device=device)) + else: + assert_allclose(output["image"], torch.tensor([1, 2], device=device)) + self.assertTrue(output["label"] is None) if __name__ == "__main__": diff --git a/tests/test_prepare_batch_default_dist.py b/tests/test_prepare_batch_default_dist.py index 0c53a74834..53a79575e6 100644 --- a/tests/test_prepare_batch_default_dist.py +++ b/tests/test_prepare_batch_default_dist.py @@ -43,6 +43,7 @@ class TestNet(torch.nn.Module): + __test__ = False # indicate to pytest that this class is not intended for collection def forward(self, x: torch.Tensor): return x diff --git a/tests/test_prepare_batch_diffusion.py b/tests/test_prepare_batch_diffusion.py new file mode 100644 index 0000000000..d969c06368 --- /dev/null +++ b/tests/test_prepare_batch_diffusion.py @@ -0,0 +1,104 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); 
+# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.engines import SupervisedEvaluator +from monai.engines.utils import DiffusionPrepareBatch +from monai.inferers import DiffusionInferer +from monai.networks.nets import DiffusionModelUNet +from monai.networks.schedulers import DDPMScheduler + +TEST_CASES = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": [8], + "norm_num_groups": 8, + "attention_levels": [True], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + (2, 1, 8, 8), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": [8], + "norm_num_groups": 8, + "attention_levels": [True], + "num_res_blocks": 1, + "num_head_channels": 8, + }, + (2, 1, 8, 8, 8), + ], +] + + +class TestPrepareBatchDiffusion(unittest.TestCase): + @parameterized.expand(TEST_CASES) + def test_output_sizes(self, input_args, image_size): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + dataloader = [{"image": torch.randn(image_size).to(device)}] + scheduler = DDPMScheduler(num_train_timesteps=20) + inferer = DiffusionInferer(scheduler=scheduler) + network = DiffusionModelUNet(**input_args).to(device) + evaluator = SupervisedEvaluator( + device=device, + val_data_loader=dataloader, + epoch_length=1, + network=network, + inferer=inferer, + non_blocking=True, + prepare_batch=DiffusionPrepareBatch(num_train_timesteps=20), + decollate=False, + ) + 
evaluator.run() + output = evaluator.state.output + # check shapes are the same + self.assertEqual(output["pred"].shape, image_size) + self.assertEqual(output["label"].shape, output["image"].shape) + + @parameterized.expand(TEST_CASES) + def test_conditioning(self, input_args, image_size): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + dataloader = [{"image": torch.randn(image_size).to(device), "context": torch.randn((2, 4, 3)).to(device)}] + scheduler = DDPMScheduler(num_train_timesteps=20) + inferer = DiffusionInferer(scheduler=scheduler) + network = DiffusionModelUNet(**input_args, with_conditioning=True, cross_attention_dim=3).to(device) + evaluator = SupervisedEvaluator( + device=device, + val_data_loader=dataloader, + epoch_length=1, + network=network, + inferer=inferer, + non_blocking=True, + prepare_batch=DiffusionPrepareBatch(num_train_timesteps=20, condition_name="context"), + decollate=False, + ) + evaluator.run() + output = evaluator.state.output + # check shapes are the same + self.assertEqual(output["pred"].shape, image_size) + self.assertEqual(output["label"].shape, output["image"].shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_prepare_batch_extra_input.py b/tests/test_prepare_batch_extra_input.py index f20c6e7352..3c53cc6481 100644 --- a/tests/test_prepare_batch_extra_input.py +++ b/tests/test_prepare_batch_extra_input.py @@ -36,6 +36,7 @@ class TestNet(torch.nn.Module): + __test__ = False # indicate to pytest that this class is not intended for collection def forward(self, x: torch.Tensor, t1=None, t2=None, t3=None): return {"x": x, "t1": t1, "t2": t2, "t3": t3} diff --git a/tests/test_prepare_batch_hovernet.py b/tests/test_prepare_batch_hovernet.py index 773fcb53bf..ae9554a3e8 100644 --- a/tests/test_prepare_batch_hovernet.py +++ b/tests/test_prepare_batch_hovernet.py @@ -28,6 +28,7 @@ class TestNet(torch.nn.Module): + __test__ = False # indicate to pytest that this class is not intended 
for collection def forward(self, x: torch.Tensor): return {HoVerNetBranch.NP: torch.tensor([1, 2]), HoVerNetBranch.NC: torch.tensor([4, 4]), HoVerNetBranch.HV: 16} diff --git a/tests/test_profiling.py b/tests/test_profiling.py index 6bee7ba262..649d980ebf 100644 --- a/tests/test_profiling.py +++ b/tests/test_profiling.py @@ -35,6 +35,7 @@ def setUp(self): self.scale = mt.ScaleIntensity() self.scale_call_name = "ScaleIntensity.__call__" + self.compose_call_name = "Compose.__call__" self.test_comp = mt.Compose([mt.ScaleIntensity(), mt.RandAxisFlip(0.5)]) self.test_image = torch.rand(1, 16, 16, 16) self.pid = os.getpid() @@ -82,7 +83,7 @@ def test_profile_multithread(self): self.assertSequenceEqual(batch.shape, (4, 1, 16, 16, 16)) results = wp.get_results() - self.assertSequenceEqual(list(results), [self.scale_call_name]) + self.assertSequenceEqual(list(results), [self.scale_call_name, self.compose_call_name]) prs = results[self.scale_call_name] @@ -98,6 +99,7 @@ def test_profile_context(self): self.scale(self.test_image) results = wp.get_results() + self.assertSequenceEqual(set(results), {"ScaleIntensity.__call__", "context"}) prs = results["context"] diff --git a/tests/test_rand_affine.py b/tests/test_rand_affine.py index f37f7827bb..2c827b7426 100644 --- a/tests/test_rand_affine.py +++ b/tests/test_rand_affine.py @@ -147,16 +147,15 @@ def test_rand_affine(self, input_param, input_data, expected_val): g.set_random_state(123) result = g(**input_data) g.rand_affine_grid.affine = torch.eye(4, dtype=torch.float64) # reset affine - test_resampler_lazy(g, result, input_param, input_data, seed=123) + test_resampler_lazy(g, result, input_param, input_data, seed=123, rtol=_rtol) if input_param.get("cache_grid", False): self.assertTrue(g._cached_grid is not None) assert_allclose(result, expected_val, rtol=_rtol, atol=1e-4, type_test="tensor") - def test_ill_cache(self): + @parameterized.expand([(None,), ((1, 1, -1),)]) + def test_ill_cache(self, spatial_size): with 
self.assertWarns(UserWarning): - RandAffine(cache_grid=True) - with self.assertWarns(UserWarning): - RandAffine(cache_grid=True, spatial_size=(1, 1, -1)) + RandAffine(cache_grid=True, spatial_size=spatial_size) @parameterized.expand(TEST_CASES_SKIPPED_CONSISTENCY) def test_skipped_transform_consistency(self, im, in_dtype): diff --git a/tests/test_rand_affined.py b/tests/test_rand_affined.py index 20c50954e2..eb8ebd06c5 100644 --- a/tests/test_rand_affined.py +++ b/tests/test_rand_affined.py @@ -234,11 +234,13 @@ def test_rand_affined(self, input_param, input_data, expected_val, track_meta): lazy_init_param["keys"], lazy_init_param["mode"] = key, mode resampler = RandAffined(**lazy_init_param).set_random_state(123) expected_output = resampler(**call_param) - test_resampler_lazy(resampler, expected_output, lazy_init_param, call_param, seed=123, output_key=key) + test_resampler_lazy( + resampler, expected_output, lazy_init_param, call_param, seed=123, output_key=key, rtol=_rtol + ) resampler.lazy = False if input_param.get("cache_grid", False): - self.assertTrue(g.rand_affine._cached_grid is not None) + self.assertIsNotNone(g.rand_affine._cached_grid) for key in res: if isinstance(key, str) and key.endswith("_transforms"): continue @@ -270,13 +272,10 @@ def test_rand_affined(self, input_param, input_data, expected_val, track_meta): self.assertEqual(len(v.applied_operations), 0) self.assertTupleEqual(v.shape, input_data[k].shape) - def test_ill_cache(self): + @parameterized.expand([(None,), ((2, -1),)]) # spatial size is None # spatial size is dynamic + def test_ill_cache(self, spatial_size): with self.assertWarns(UserWarning): - # spatial size is None - RandAffined(device=device, spatial_size=None, prob=1.0, cache_grid=True, keys=("img", "seg")) - with self.assertWarns(UserWarning): - # spatial size is dynamic - RandAffined(device=device, spatial_size=(2, -1), prob=1.0, cache_grid=True, keys=("img", "seg")) + RandAffined(device=device, spatial_size=spatial_size, 
prob=1.0, cache_grid=True, keys=("img", "seg")) if __name__ == "__main__": diff --git a/tests/test_rand_bias_field.py b/tests/test_rand_bias_field.py index 333a9ecba5..328f46b7ee 100644 --- a/tests/test_rand_bias_field.py +++ b/tests/test_rand_bias_field.py @@ -39,7 +39,7 @@ def test_output_shape(self, class_args, img_shape): img = p(np.random.rand(*img_shape)) output = bias_field(img) np.testing.assert_equal(output.shape, img_shape) - self.assertTrue(output.dtype in (np.float32, torch.float32)) + self.assertIn(output.dtype, (np.float32, torch.float32)) img_zero = np.zeros([*img_shape]) output_zero = bias_field(img_zero) diff --git a/tests/test_rand_gaussian_noise.py b/tests/test_rand_gaussian_noise.py index a56e54fe31..233b4dd1b6 100644 --- a/tests/test_rand_gaussian_noise.py +++ b/tests/test_rand_gaussian_noise.py @@ -22,22 +22,24 @@ TESTS = [] for p in TEST_NDARRAYS: - TESTS.append(("test_zero_mean", p, 0, 0.1)) - TESTS.append(("test_non_zero_mean", p, 1, 0.5)) + TESTS.append(("test_zero_mean", p, 0, 0.1, True)) + TESTS.append(("test_non_zero_mean", p, 1, 0.5, True)) + TESTS.append(("test_no_sample_std", p, 1, 0.5, False)) class TestRandGaussianNoise(NumpyImageTestCase2D): @parameterized.expand(TESTS) - def test_correct_results(self, _, im_type, mean, std): + def test_correct_results(self, _, im_type, mean, std, sample_std): seed = 0 - gaussian_fn = RandGaussianNoise(prob=1.0, mean=mean, std=std) + gaussian_fn = RandGaussianNoise(prob=1.0, mean=mean, std=std, sample_std=sample_std) gaussian_fn.set_random_state(seed) im = im_type(self.imt) noised = gaussian_fn(im) np.random.seed(seed) np.random.random() - expected = self.imt + np.random.normal(mean, np.random.uniform(0, std), size=self.imt.shape) + _std = np.random.uniform(0, std) if sample_std else std + expected = self.imt + np.random.normal(mean, _std, size=self.imt.shape) if isinstance(noised, torch.Tensor): noised = noised.cpu() np.testing.assert_allclose(expected, noised, atol=1e-5) diff --git 
a/tests/test_rand_gaussian_noised.py b/tests/test_rand_gaussian_noised.py index bcbed98b5a..e3df196be2 100644 --- a/tests/test_rand_gaussian_noised.py +++ b/tests/test_rand_gaussian_noised.py @@ -22,8 +22,9 @@ TESTS = [] for p in TEST_NDARRAYS: - TESTS.append(["test_zero_mean", p, ["img1", "img2"], 0, 0.1]) - TESTS.append(["test_non_zero_mean", p, ["img1", "img2"], 1, 0.5]) + TESTS.append(["test_zero_mean", p, ["img1", "img2"], 0, 0.1, True]) + TESTS.append(["test_non_zero_mean", p, ["img1", "img2"], 1, 0.5, True]) + TESTS.append(["test_no_sample_std", p, ["img1", "img2"], 1, 0.5, False]) seed = 0 @@ -31,15 +32,18 @@ class TestRandGaussianNoised(NumpyImageTestCase2D): @parameterized.expand(TESTS) - def test_correct_results(self, _, im_type, keys, mean, std): - gaussian_fn = RandGaussianNoised(keys=keys, prob=1.0, mean=mean, std=std, dtype=np.float64) + def test_correct_results(self, _, im_type, keys, mean, std, sample_std): + gaussian_fn = RandGaussianNoised( + keys=keys, prob=1.0, mean=mean, std=std, dtype=np.float64, sample_std=sample_std + ) gaussian_fn.set_random_state(seed) im = im_type(self.imt) noised = gaussian_fn({k: im for k in keys}) np.random.seed(seed) # simulate the randomize() of transform np.random.random() - noise = np.random.normal(mean, np.random.uniform(0, std), size=self.imt.shape) + _std = np.random.uniform(0, std) if sample_std else std + noise = np.random.normal(mean, _std, size=self.imt.shape) for k in keys: expected = self.imt + noise if isinstance(noised[k], torch.Tensor): diff --git a/tests/test_rand_gibbs_noise.py b/tests/test_rand_gibbs_noise.py index 4befeffbe2..5ef249a1f4 100644 --- a/tests/test_rand_gibbs_noise.py +++ b/tests/test_rand_gibbs_noise.py @@ -90,6 +90,15 @@ def test_alpha(self, im_shape, input_type): self.assertGreaterEqual(t.sampled_alpha, 0.5) self.assertLessEqual(t.sampled_alpha, 0.51) + @parameterized.expand(TEST_CASES) + def test_alpha_single_value(self, im_shape, input_type): + im = self.get_data(im_shape, 
input_type) + alpha = 0.01 + t = RandGibbsNoise(1.0, alpha) + _ = t(deepcopy(im)) + self.assertGreaterEqual(t.sampled_alpha, 0) + self.assertLessEqual(t.sampled_alpha, 0.01) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_rand_gibbs_noised.py b/tests/test_rand_gibbs_noised.py index 6580189af6..382290dd39 100644 --- a/tests/test_rand_gibbs_noised.py +++ b/tests/test_rand_gibbs_noised.py @@ -105,6 +105,14 @@ def test_alpha(self, im_shape, input_type): _ = t(deepcopy(data)) self.assertTrue(0.5 <= t.rand_gibbs_noise.sampled_alpha <= 0.51) + @parameterized.expand(TEST_CASES) + def test_alpha_single_value(self, im_shape, input_type): + data = self.get_data(im_shape, input_type) + alpha = 0.01 + t = RandGibbsNoised(KEYS, 1.0, alpha) + _ = t(deepcopy(data)) + self.assertTrue(0 <= t.rand_gibbs_noise.sampled_alpha <= 0.01) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_rand_weighted_cropd.py b/tests/test_rand_weighted_cropd.py index 1524442f61..a1414df0ac 100644 --- a/tests/test_rand_weighted_cropd.py +++ b/tests/test_rand_weighted_cropd.py @@ -154,7 +154,7 @@ def test_rand_weighted_cropd(self, _, init_params, input_data, expected_shape, e crop = RandWeightedCropd(**init_params) crop.set_random_state(10) result = crop(input_data) - self.assertTrue(len(result) == init_params["num_samples"]) + self.assertEqual(len(result), init_params["num_samples"]) _len = len(tuple(input_data.keys())) self.assertTupleEqual(tuple(result[0].keys())[:_len], tuple(input_data.keys())) diff --git a/tests/test_recon_net_utils.py b/tests/test_recon_net_utils.py index 1815000777..48d3b59a17 100644 --- a/tests/test_recon_net_utils.py +++ b/tests/test_recon_net_utils.py @@ -64,7 +64,7 @@ def test_reshape_channel_complex(self, test_data): def test_complex_normalize(self, test_data): result, mean, std = complex_normalize(test_data) result = result * std + mean - self.assertTrue((((result - test_data) ** 2).mean() ** 0.5).item() < 1e-5) + 
self.assertLess((((result - test_data) ** 2).mean() ** 0.5).item(), 1e-5) @parameterized.expand(TEST_PAD) def test_pad(self, test_data): diff --git a/tests/test_reg_loss_integration.py b/tests/test_reg_loss_integration.py index e8f82eb0c2..8afc2da6ad 100644 --- a/tests/test_reg_loss_integration.py +++ b/tests/test_reg_loss_integration.py @@ -83,6 +83,9 @@ def forward(self, x): # initialize a SGD optimizer optimizer = optim.Adam(net.parameters(), lr=learning_rate) + # declare first for pylint + init_loss = None + # train the network for it in range(max_iter): # set the gradient to zero @@ -99,7 +102,7 @@ def forward(self, x): # backward pass loss_val.backward() optimizer.step() - self.assertTrue(init_loss > loss_val, "loss did not decrease") + self.assertGreater(init_loss, loss_val, "loss did not decrease") if __name__ == "__main__": diff --git a/tests/test_regularization.py b/tests/test_regularization.py new file mode 100644 index 0000000000..12d64637d5 --- /dev/null +++ b/tests/test_regularization.py @@ -0,0 +1,142 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +import numpy as np +import torch + +from monai.transforms import CutMix, CutMixd, CutOut, CutOutd, MixUp, MixUpd +from tests.utils import assert_allclose + + +class TestMixup(unittest.TestCase): + + def test_mixup(self): + for dims in [2, 3]: + shape = (6, 3) + (32,) * dims + sample = torch.rand(*shape, dtype=torch.float32) + mixup = MixUp(6, 1.0) + mixup.set_random_state(seed=0) + output = mixup(sample) + np.random.seed(0) + # simulate the randomize() of transform + np.random.random() + weight = torch.from_numpy(np.random.beta(1.0, 1.0, 6)).type(torch.float32) + perm = np.random.permutation(6) + self.assertEqual(output.shape, sample.shape) + mixweight = weight[(Ellipsis,) + (None,) * (dims + 1)] + expected = mixweight * sample + (1 - mixweight) * sample[perm, ...] + assert_allclose(output, expected, type_test=False, atol=1e-7) + + with self.assertRaises(ValueError): + MixUp(6, -0.5) + + mixup = MixUp(6, 0.5) + for dims in [2, 3]: + with self.assertRaises(ValueError): + shape = (5, 3) + (32,) * dims + sample = torch.rand(*shape, dtype=torch.float32) + mixup(sample) + + def test_mixupd(self): + for dims in [2, 3]: + shape = (6, 3) + (32,) * dims + t = torch.rand(*shape, dtype=torch.float32) + sample = {"a": t, "b": t} + mixup = MixUpd(["a", "b"], 6) + mixup.set_random_state(seed=0) + output = mixup(sample) + np.random.seed(0) + # simulate the randomize() of transform + np.random.random() + weight = torch.from_numpy(np.random.beta(1.0, 1.0, 6)).type(torch.float32) + perm = np.random.permutation(6) + self.assertEqual(output["a"].shape, sample["a"].shape) + mixweight = weight[(Ellipsis,) + (None,) * (dims + 1)] + expected = mixweight * sample["a"] + (1 - mixweight) * sample["a"][perm, ...] 
+ assert_allclose(output["a"], expected, type_test=False, atol=1e-7) + assert_allclose(output["a"], output["b"], type_test=False, atol=1e-7) + # self.assertTrue(torch.allclose(output["a"], output["b"])) + + with self.assertRaises(ValueError): + MixUpd(["k1", "k2"], 6, -0.5) + + +class TestCutMix(unittest.TestCase): + + def test_cutmix(self): + for dims in [2, 3]: + shape = (6, 3) + (32,) * dims + sample = torch.rand(*shape, dtype=torch.float32) + cutmix = CutMix(6, 1.0) + cutmix.set_random_state(seed=0) + output = cutmix(sample) + self.assertEqual(output.shape, sample.shape) + self.assertTrue(any(not torch.allclose(sample, cutmix(sample)) for _ in range(10))) + + def test_cutmixd(self): + for dims in [2, 3]: + shape = (6, 3) + (32,) * dims + t = torch.rand(*shape, dtype=torch.float32) + label = torch.randint(0, 1, shape) + sample = {"a": t, "b": t, "lbl1": label, "lbl2": label} + cutmix = CutMixd(["a", "b"], 6, label_keys=("lbl1", "lbl2")) + cutmix.set_random_state(seed=123) + output = cutmix(sample) + # but mixing of labels is not affected by it + self.assertTrue(torch.allclose(output["lbl1"], output["lbl2"])) + + +class TestCutOut(unittest.TestCase): + + def test_cutout(self): + for dims in [2, 3]: + shape = (6, 3) + (32,) * dims + sample = torch.rand(*shape, dtype=torch.float32) + cutout = CutOut(6, 1.0) + cutout.set_random_state(seed=123) + output = cutout(sample) + np.random.seed(123) + # simulate the randomize() of transform + np.random.random() + weight = torch.from_numpy(np.random.beta(1.0, 1.0, 6)).type(torch.float32) + perm = np.random.permutation(6) + coords = [torch.from_numpy(np.random.randint(0, d, size=(1,))) for d in sample.shape[2:]] + assert_allclose(weight, cutout._params[0]) + assert_allclose(perm, cutout._params[1]) + self.assertSequenceEqual(coords, cutout._params[2]) + self.assertEqual(output.shape, sample.shape) + + def test_cutoutd(self): + for dims in [2, 3]: + shape = (6, 3) + (32,) * dims + t = torch.rand(*shape, dtype=torch.float32) + 
sample = {"a": t, "b": t} + cutout = CutOutd(["a", "b"], 6, 1.0) + cutout.set_random_state(seed=123) + output = cutout(sample) + np.random.seed(123) + # simulate the randomize() of transform + np.random.random() + weight = torch.from_numpy(np.random.beta(1.0, 1.0, 6)).type(torch.float32) + perm = np.random.permutation(6) + coords = [torch.from_numpy(np.random.randint(0, d, size=(1,))) for d in t.shape[2:]] + assert_allclose(weight, cutout.cutout._params[0]) + assert_allclose(perm, cutout.cutout._params[1]) + self.assertSequenceEqual(coords, cutout.cutout._params[2]) + self.assertEqual(output["a"].shape, sample["a"].shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_resize.py b/tests/test_resize.py index 65b33ea649..d4c57e2742 100644 --- a/tests/test_resize.py +++ b/tests/test_resize.py @@ -46,6 +46,7 @@ class TestResize(NumpyImageTestCase2D): + def test_invalid_inputs(self): with self.assertRaises(ValueError): resize = Resize(spatial_size=(128, 128, 3), mode="order") diff --git a/tests/test_resized.py b/tests/test_resized.py index d62f29ab5c..243a4e6622 100644 --- a/tests/test_resized.py +++ b/tests/test_resized.py @@ -66,6 +66,7 @@ @SkipIfAtLeastPyTorchVersion((2, 2, 0)) # https://github.com/Project-MONAI/MONAI/issues/7445 class TestResized(NumpyImageTestCase2D): + def test_invalid_inputs(self): with self.assertRaises(ValueError): resize = Resized(keys="img", spatial_size=(128, 128, 3), mode="order") diff --git a/tests/test_resnet.py b/tests/test_resnet.py index ad1aad8fc6..a55d18f5de 100644 --- a/tests/test_resnet.py +++ b/tests/test_resnet.py @@ -24,6 +24,7 @@ from monai.networks import eval_mode from monai.networks.nets import ( ResNet, + ResNetFeatures, get_medicalnet_pretrained_resnet_args, get_pretrained_resnet_medicalnet, resnet10, @@ -36,7 +37,14 @@ ) from monai.networks.nets.resnet import ResNetBlock from monai.utils import optional_import -from tests.utils import equal_state_dict, skip_if_downloading_fails, skip_if_no_cuda, 
skip_if_quick, test_script_save +from tests.utils import ( + SkipIfNoModule, + equal_state_dict, + skip_if_downloading_fails, + skip_if_no_cuda, + skip_if_quick, + test_script_save, +) if TYPE_CHECKING: import torchvision @@ -99,6 +107,7 @@ "num_classes": 3, "conv1_t_size": [3], "conv1_t_stride": 1, + "act": ("relu", {"inplace": False}), }, (1, 2, 32), (1, 3), @@ -177,19 +186,60 @@ (1, 3), ] +TEST_CASE_8 = [ + { + "block": "bottleneck", + "layers": [3, 4, 6, 3], + "block_inplanes": [64, 128, 256, 512], + "spatial_dims": 1, + "n_input_channels": 2, + "num_classes": 3, + "conv1_t_size": [3], + "conv1_t_stride": 1, + "act": ("relu", {"inplace": False}), + }, + (1, 2, 32), + (1, 3), +] + +TEST_CASE_9 = [ # Layer norm + { + "block": ResNetBlock, + "layers": [3, 4, 6, 3], + "block_inplanes": [64, 128, 256, 512], + "spatial_dims": 1, + "n_input_channels": 2, + "num_classes": 3, + "conv1_t_size": [3], + "conv1_t_stride": 1, + "act": ("relu", {"inplace": False}), + "norm": ("layer", {"normalized_shape": (64, 32)}), + }, + (1, 2, 32), + (1, 3), +] + TEST_CASES = [] PRETRAINED_TEST_CASES = [] for case in [TEST_CASE_1, TEST_CASE_2, TEST_CASE_3, TEST_CASE_2_A, TEST_CASE_3_A]: for model in [resnet10, resnet18, resnet34, resnet50, resnet101, resnet152, resnet200]: TEST_CASES.append([model, *case]) PRETRAINED_TEST_CASES.append([model, *case]) -for case in [TEST_CASE_5, TEST_CASE_5_A, TEST_CASE_6, TEST_CASE_7]: +for case in [TEST_CASE_5, TEST_CASE_5_A, TEST_CASE_6, TEST_CASE_7, TEST_CASE_8, TEST_CASE_9]: TEST_CASES.append([ResNet, *case]) TEST_SCRIPT_CASES = [ [model, *TEST_CASE_1] for model in [resnet10, resnet18, resnet34, resnet50, resnet101, resnet152, resnet200] ] +CASE_EXTRACT_FEATURES = [ + ( + {"model_name": "resnet10", "pretrained": True, "spatial_dims": 3, "in_channels": 1}, + [1, 1, 64, 64, 64], + ([1, 64, 32, 32, 32], [1, 64, 16, 16, 16], [1, 128, 8, 8, 8], [1, 256, 4, 4, 4], [1, 512, 2, 2, 2]), + ) +] + class TestResNet(unittest.TestCase): @@ -211,12 +261,12 @@ def 
test_resnet_shape(self, model, input_param, input_shape, expected_shape): if input_param.get("feed_forward", True): self.assertEqual(result.shape, expected_shape) else: - self.assertTrue(result.shape in expected_shape) + self.assertIn(result.shape, expected_shape) @parameterized.expand(PRETRAINED_TEST_CASES) @skip_if_quick @skip_if_no_cuda - def test_resnet_pretrained(self, model, input_param, input_shape, expected_shape): + def test_resnet_pretrained(self, model, input_param, _input_shape, _expected_shape): net = model(**input_param).to(device) # Save ckpt torch.save(net.state_dict(), self.tmp_ckpt_filename) @@ -240,9 +290,7 @@ def test_resnet_pretrained(self, model, input_param, input_shape, expected_shape and input_param.get("n_input_channels", 3) == 1 and input_param.get("feed_forward", True) is False and input_param.get("shortcut_type", "B") == shortcut_type - and ( - input_param.get("bias_downsample", True) == bool(bias_downsample) if bias_downsample != -1 else True - ) + and (input_param.get("bias_downsample", True) == bias_downsample) ): model(**cp_input_param) else: @@ -253,7 +301,7 @@ def test_resnet_pretrained(self, model, input_param, input_shape, expected_shape cp_input_param["n_input_channels"] = 1 cp_input_param["feed_forward"] = False cp_input_param["shortcut_type"] = shortcut_type - cp_input_param["bias_downsample"] = bool(bias_downsample) if bias_downsample != -1 else True + cp_input_param["bias_downsample"] = bias_downsample if cp_input_param.get("spatial_dims", 3) == 3: with skip_if_downloading_fails(): pretrained_net = model(**cp_input_param).to(device) @@ -270,5 +318,25 @@ def test_script(self, model, input_param, input_shape, expected_shape): test_script_save(net, test_data) +@SkipIfNoModule("hf_hub_download") +class TestExtractFeatures(unittest.TestCase): + + @parameterized.expand(CASE_EXTRACT_FEATURES) + def test_shape(self, input_param, input_shape, expected_shapes): + device = "cuda" if torch.cuda.is_available() else "cpu" + + with 
skip_if_downloading_fails(): + net = ResNetFeatures(**input_param).to(device) + + # run inference with random tensor + with eval_mode(net): + features = net(torch.randn(input_shape).to(device)) + + # check output shape + self.assertEqual(len(features), len(expected_shapes)) + for feature, expected_shape in zip(features, expected_shapes): + self.assertEqual(feature.shape, torch.Size(expected_shape)) + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_scheduler_ddim.py b/tests/test_scheduler_ddim.py new file mode 100644 index 0000000000..1a8f8cab67 --- /dev/null +++ b/tests/test_scheduler_ddim.py @@ -0,0 +1,83 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.networks.schedulers import DDIMScheduler +from tests.utils import assert_allclose + +TEST_2D_CASE = [] +for beta_schedule in ["linear_beta", "scaled_linear_beta"]: + TEST_2D_CASE.append([{"schedule": beta_schedule}, (2, 6, 16, 16), (2, 6, 16, 16)]) + +TEST_3D_CASE = [] +for beta_schedule in ["linear_beta", "scaled_linear_beta"]: + TEST_3D_CASE.append([{"schedule": beta_schedule}, (2, 6, 16, 16, 16), (2, 6, 16, 16, 16)]) + +TEST_CASES = TEST_2D_CASE + TEST_3D_CASE + +TEST_FULl_LOOP = [ + [{"schedule": "linear_beta"}, (1, 1, 2, 2), torch.Tensor([[[[-0.9579, -0.6457], [0.4684, -0.9694]]]])] +] + + +class TestDDPMScheduler(unittest.TestCase): + @parameterized.expand(TEST_CASES) + def test_add_noise(self, input_param, input_shape, expected_shape): + scheduler = DDIMScheduler(**input_param) + scheduler.set_timesteps(num_inference_steps=100) + original_sample = torch.zeros(input_shape) + noise = torch.randn_like(original_sample) + timesteps = torch.randint(0, scheduler.num_train_timesteps, (original_sample.shape[0],)).long() + + noisy = scheduler.add_noise(original_samples=original_sample, noise=noise, timesteps=timesteps) + self.assertEqual(noisy.shape, expected_shape) + + @parameterized.expand(TEST_CASES) + def test_step_shape(self, input_param, input_shape, expected_shape): + scheduler = DDIMScheduler(**input_param) + scheduler.set_timesteps(num_inference_steps=100) + model_output = torch.randn(input_shape) + sample = torch.randn(input_shape) + output_step = scheduler.step(model_output=model_output, timestep=500, sample=sample) + self.assertEqual(output_step[0].shape, expected_shape) + self.assertEqual(output_step[1].shape, expected_shape) + + @parameterized.expand(TEST_FULl_LOOP) + def test_full_timestep_loop(self, input_param, input_shape, expected_output): + scheduler = DDIMScheduler(**input_param) + scheduler.set_timesteps(50) + 
torch.manual_seed(42) + model_output = torch.randn(input_shape) + sample = torch.randn(input_shape) + for t in range(50): + sample, _ = scheduler.step(model_output=model_output, timestep=t, sample=sample) + assert_allclose(sample, expected_output, rtol=1e-3, atol=1e-3) + + def test_set_timesteps(self): + scheduler = DDIMScheduler(num_train_timesteps=1000) + scheduler.set_timesteps(num_inference_steps=100) + self.assertEqual(scheduler.num_inference_steps, 100) + self.assertEqual(len(scheduler.timesteps), 100) + + def test_set_timesteps_with_num_inference_steps_bigger_than_num_train_timesteps(self): + scheduler = DDIMScheduler(num_train_timesteps=1000) + with self.assertRaises(ValueError): + scheduler.set_timesteps(num_inference_steps=2000) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_scheduler_ddpm.py b/tests/test_scheduler_ddpm.py new file mode 100644 index 0000000000..f0447aded2 --- /dev/null +++ b/tests/test_scheduler_ddpm.py @@ -0,0 +1,104 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.networks.schedulers import DDPMScheduler +from tests.utils import assert_allclose + +TEST_2D_CASE = [] +for beta_schedule in ["linear_beta", "scaled_linear_beta"]: + for variance_type in ["fixed_small", "fixed_large"]: + TEST_2D_CASE.append( + [{"schedule": beta_schedule, "variance_type": variance_type}, (2, 6, 16, 16), (2, 6, 16, 16)] + ) + +TEST_3D_CASE = [] +for beta_schedule in ["linear_beta", "scaled_linear_beta"]: + for variance_type in ["fixed_small", "fixed_large"]: + TEST_3D_CASE.append( + [{"schedule": beta_schedule, "variance_type": variance_type}, (2, 6, 16, 16, 16), (2, 6, 16, 16, 16)] + ) + +TEST_CASES = TEST_2D_CASE + TEST_3D_CASE + +TEST_FULl_LOOP = [ + [{"schedule": "linear_beta"}, (1, 1, 2, 2), torch.Tensor([[[[-1.0153, -0.3218], [0.8454, -0.7870]]]])] +] + + +class TestDDPMScheduler(unittest.TestCase): + @parameterized.expand(TEST_CASES) + def test_add_noise(self, input_param, input_shape, expected_shape): + scheduler = DDPMScheduler(**input_param) + original_sample = torch.zeros(input_shape) + noise = torch.randn_like(original_sample) + timesteps = torch.randint(0, scheduler.num_train_timesteps, (original_sample.shape[0],)).long() + + noisy = scheduler.add_noise(original_samples=original_sample, noise=noise, timesteps=timesteps) + self.assertEqual(noisy.shape, expected_shape) + + @parameterized.expand(TEST_CASES) + def test_step_shape(self, input_param, input_shape, expected_shape): + scheduler = DDPMScheduler(**input_param) + model_output = torch.randn(input_shape) + sample = torch.randn(input_shape) + output_step = scheduler.step(model_output=model_output, timestep=500, sample=sample) + self.assertEqual(output_step[0].shape, expected_shape) + self.assertEqual(output_step[1].shape, expected_shape) + + @parameterized.expand(TEST_FULl_LOOP) + def test_full_timestep_loop(self, input_param, input_shape, expected_output): + 
scheduler = DDPMScheduler(**input_param) + scheduler.set_timesteps(50) + torch.manual_seed(42) + model_output = torch.randn(input_shape) + sample = torch.randn(input_shape) + for t in range(50): + sample, _ = scheduler.step(model_output=model_output, timestep=t, sample=sample) + assert_allclose(sample, expected_output, rtol=1e-3, atol=1e-3) + + @parameterized.expand(TEST_CASES) + def test_get_velocity_shape(self, input_param, input_shape, expected_shape): + scheduler = DDPMScheduler(**input_param) + sample = torch.randn(input_shape) + timesteps = torch.randint(0, scheduler.num_train_timesteps, (input_shape[0],)).long() + velocity = scheduler.get_velocity(sample=sample, noise=sample, timesteps=timesteps) + self.assertEqual(velocity.shape, expected_shape) + + def test_step_learned(self): + for variance_type in ["learned", "learned_range"]: + scheduler = DDPMScheduler(variance_type=variance_type) + model_output = torch.randn(2, 6, 16, 16) + sample = torch.randn(2, 3, 16, 16) + output_step = scheduler.step(model_output=model_output, timestep=500, sample=sample) + self.assertEqual(output_step[0].shape, sample.shape) + self.assertEqual(output_step[1].shape, sample.shape) + + def test_set_timesteps(self): + scheduler = DDPMScheduler(num_train_timesteps=1000) + scheduler.set_timesteps(num_inference_steps=100) + self.assertEqual(scheduler.num_inference_steps, 100) + self.assertEqual(len(scheduler.timesteps), 100) + + def test_set_timesteps_with_num_inference_steps_bigger_than_num_train_timesteps(self): + scheduler = DDPMScheduler(num_train_timesteps=1000) + with self.assertRaises(ValueError): + scheduler.set_timesteps(num_inference_steps=2000) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_scheduler_pndm.py b/tests/test_scheduler_pndm.py new file mode 100644 index 0000000000..69e5e403f5 --- /dev/null +++ b/tests/test_scheduler_pndm.py @@ -0,0 +1,108 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.networks.schedulers import PNDMScheduler +from tests.utils import assert_allclose + +TEST_2D_CASE = [] +for beta_schedule in ["linear_beta", "scaled_linear_beta"]: + TEST_2D_CASE.append([{"schedule": beta_schedule}, (2, 6, 16, 16), (2, 6, 16, 16)]) + +TEST_3D_CASE = [] +for beta_schedule in ["linear_beta", "scaled_linear_beta"]: + TEST_3D_CASE.append([{"schedule": beta_schedule}, (2, 6, 16, 16, 16), (2, 6, 16, 16, 16)]) + +TEST_CASES = TEST_2D_CASE + TEST_3D_CASE + +TEST_FULl_LOOP = [ + [ + {"schedule": "linear_beta"}, + (1, 1, 2, 2), + torch.Tensor([[[[-2123055.2500, -459014.2812], [2863438.0000, -1263401.7500]]]]), + ] +] + + +class TestDDPMScheduler(unittest.TestCase): + @parameterized.expand(TEST_CASES) + def test_add_noise(self, input_param, input_shape, expected_shape): + scheduler = PNDMScheduler(**input_param) + original_sample = torch.zeros(input_shape) + noise = torch.randn_like(original_sample) + timesteps = torch.randint(0, scheduler.num_train_timesteps, (original_sample.shape[0],)).long() + noisy = scheduler.add_noise(original_samples=original_sample, noise=noise, timesteps=timesteps) + self.assertEqual(noisy.shape, expected_shape) + + @parameterized.expand(TEST_CASES) + def test_step_shape(self, input_param, input_shape, expected_shape): + scheduler = PNDMScheduler(**input_param) + scheduler.set_timesteps(600) + model_output = 
torch.randn(input_shape) + sample = torch.randn(input_shape) + output_step = scheduler.step(model_output=model_output, timestep=500, sample=sample) + self.assertEqual(output_step[0].shape, expected_shape) + self.assertEqual(output_step[1], None) + + @parameterized.expand(TEST_FULl_LOOP) + def test_full_timestep_loop(self, input_param, input_shape, expected_output): + scheduler = PNDMScheduler(**input_param) + scheduler.set_timesteps(50) + torch.manual_seed(42) + model_output = torch.randn(input_shape) + sample = torch.randn(input_shape) + for t in range(50): + sample, _ = scheduler.step(model_output=model_output, timestep=t, sample=sample) + assert_allclose(sample, expected_output, rtol=1e-3, atol=1e-3) + + @parameterized.expand(TEST_FULl_LOOP) + def test_timestep_two_loops(self, input_param, input_shape, expected_output): + scheduler = PNDMScheduler(**input_param) + scheduler.set_timesteps(50) + torch.manual_seed(42) + model_output = torch.randn(input_shape) + sample = torch.randn(input_shape) + for t in range(50): + sample, _ = scheduler.step(model_output=model_output, timestep=t, sample=sample) + torch.manual_seed(42) + model_output2 = torch.randn(input_shape) + sample2 = torch.randn(input_shape) + scheduler.set_timesteps(50) + for t in range(50): + sample2, _ = scheduler.step(model_output=model_output2, timestep=t, sample=sample2) + assert_allclose(sample, sample2, rtol=1e-3, atol=1e-3) + + def test_set_timesteps(self): + scheduler = PNDMScheduler(num_train_timesteps=1000, skip_prk_steps=True) + scheduler.set_timesteps(num_inference_steps=100) + self.assertEqual(scheduler.num_inference_steps, 100) + self.assertEqual(len(scheduler.timesteps), 100) + + def test_set_timesteps_prk(self): + scheduler = PNDMScheduler(num_train_timesteps=1000, skip_prk_steps=False) + scheduler.set_timesteps(num_inference_steps=100) + self.assertEqual(scheduler.num_inference_steps, 109) + self.assertEqual(len(scheduler.timesteps), 109) + + def 
test_set_timesteps_with_num_inference_steps_bigger_than_num_train_timesteps(self): + scheduler = PNDMScheduler(num_train_timesteps=1000) + with self.assertRaises(ValueError): + scheduler.set_timesteps(num_inference_steps=2000) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_segresnet_ds.py b/tests/test_segresnet_ds.py index 5372fcc8ae..eab7bac9a0 100644 --- a/tests/test_segresnet_ds.py +++ b/tests/test_segresnet_ds.py @@ -17,7 +17,7 @@ from parameterized import parameterized from monai.networks import eval_mode -from monai.networks.nets import SegResNetDS +from monai.networks.nets import SegResNetDS, SegResNetDS2 from tests.utils import SkipIfBeforePyTorchVersion, test_script_save device = "cuda" if torch.cuda.is_available() else "cpu" @@ -71,7 +71,7 @@ ] -class TestResNetDS(unittest.TestCase): +class TestSegResNetDS(unittest.TestCase): @parameterized.expand(TEST_CASE_SEGRESNET_DS) def test_shape(self, input_param, input_shape, expected_shape): @@ -80,47 +80,71 @@ def test_shape(self, input_param, input_shape, expected_shape): result = net(torch.randn(input_shape).to(device)) self.assertEqual(result.shape, expected_shape, msg=str(input_param)) + @parameterized.expand(TEST_CASE_SEGRESNET_DS) + def test_shape_ds2(self, input_param, input_shape, expected_shape): + net = SegResNetDS2(**input_param).to(device) + with eval_mode(net): + result = net(torch.randn(input_shape).to(device), with_label=False) + self.assertEqual(result[0].shape, expected_shape, msg=str(input_param)) + self.assertTrue(result[1] == []) + + result = net(torch.randn(input_shape).to(device), with_point=False) + self.assertEqual(result[1].shape, expected_shape, msg=str(input_param)) + self.assertTrue(result[0] == []) + @parameterized.expand(TEST_CASE_SEGRESNET_DS2) def test_shape2(self, input_param, input_shape, expected_shape): dsdepth = input_param.get("dsdepth", 1) - net = SegResNetDS(**input_param).to(device) - - net.train() - result = 
net(torch.randn(input_shape).to(device)) - if dsdepth > 1: - assert isinstance(result, list) - self.assertEqual(dsdepth, len(result)) - for i in range(dsdepth): - self.assertEqual( - result[i].shape, - expected_shape[:2] + tuple(e // (2**i) for e in expected_shape[2:]), - msg=str(input_param), - ) - else: - assert isinstance(result, torch.Tensor) - self.assertEqual(result.shape, expected_shape, msg=str(input_param)) - - net.eval() - result = net(torch.randn(input_shape).to(device)) - assert isinstance(result, torch.Tensor) - self.assertEqual(result.shape, expected_shape, msg=str(input_param)) + for net in [SegResNetDS, SegResNetDS2]: + net = net(**input_param).to(device) + net.train() + if isinstance(net, SegResNetDS2): + result = net(torch.randn(input_shape).to(device), with_label=False)[0] + else: + result = net(torch.randn(input_shape).to(device)) + if dsdepth > 1: + assert isinstance(result, list) + self.assertEqual(dsdepth, len(result)) + for i in range(dsdepth): + self.assertEqual( + result[i].shape, + expected_shape[:2] + tuple(e // (2**i) for e in expected_shape[2:]), + msg=str(input_param), + ) + else: + assert isinstance(result, torch.Tensor) + self.assertEqual(result.shape, expected_shape, msg=str(input_param)) + + if not isinstance(net, SegResNetDS2): + # eval mode of SegResNetDS2 has same output as training mode + # so only test eval mode for SegResNetDS + net.eval() + result = net(torch.randn(input_shape).to(device)) + assert isinstance(result, torch.Tensor) + self.assertEqual(result.shape, expected_shape, msg=str(input_param)) @parameterized.expand(TEST_CASE_SEGRESNET_DS3) def test_shape3(self, input_param, input_shape, expected_shapes): dsdepth = input_param.get("dsdepth", 1) - net = SegResNetDS(**input_param).to(device) - - net.train() - result = net(torch.randn(input_shape).to(device)) - assert isinstance(result, list) - self.assertEqual(dsdepth, len(result)) - for i in range(dsdepth): - self.assertEqual(result[i].shape, expected_shapes[i], 
msg=str(input_param)) + for net in [SegResNetDS, SegResNetDS2]: + net = net(**input_param).to(device) + net.train() + if isinstance(net, SegResNetDS2): + result = net(torch.randn(input_shape).to(device), with_point=False)[1] + else: + result = net(torch.randn(input_shape).to(device)) + assert isinstance(result, list) + self.assertEqual(dsdepth, len(result)) + for i in range(dsdepth): + self.assertEqual(result[i].shape, expected_shapes[i], msg=str(input_param)) def test_ill_arg(self): with self.assertRaises(ValueError): SegResNetDS(spatial_dims=4) + with self.assertRaises(ValueError): + SegResNetDS2(spatial_dims=4) + @SkipIfBeforePyTorchVersion((1, 10)) def test_script(self): input_param, input_shape, _ = TEST_CASE_SEGRESNET_DS[0] diff --git a/tests/test_selfattention.py b/tests/test_selfattention.py index b8be4fd1b6..88919fd8b1 100644 --- a/tests/test_selfattention.py +++ b/tests/test_selfattention.py @@ -20,7 +20,9 @@ from monai.networks import eval_mode from monai.networks.blocks.selfattention import SABlock +from monai.networks.layers.factories import RelPosEmbedding from monai.utils import optional_import +from tests.utils import SkipIfBeforePyTorchVersion, assert_allclose, test_script_save einops, has_einops = optional_import("einops") @@ -28,18 +30,32 @@ for dropout_rate in np.linspace(0, 1, 4): for hidden_size in [360, 480, 600, 768]: for num_heads in [4, 6, 8, 12]: - test_case = [ - {"hidden_size": hidden_size, "num_heads": num_heads, "dropout_rate": dropout_rate}, - (2, 512, hidden_size), - (2, 512, hidden_size), - ] - TEST_CASE_SABLOCK.append(test_case) + for rel_pos_embedding in [None, RelPosEmbedding.DECOMPOSED]: + for input_size in [(16, 32), (8, 8, 8)]: + for include_fc in [True, False]: + for use_combined_linear in [True, False]: + test_case = [ + { + "hidden_size": hidden_size, + "num_heads": num_heads, + "dropout_rate": dropout_rate, + "rel_pos_embedding": rel_pos_embedding, + "input_size": input_size, + "include_fc": include_fc, + 
"use_combined_linear": use_combined_linear, + "use_flash_attention": True if rel_pos_embedding is None else False, + }, + (2, 512, hidden_size), + (2, 512, hidden_size), + ] + TEST_CASE_SABLOCK.append(test_case) class TestResBlock(unittest.TestCase): @parameterized.expand(TEST_CASE_SABLOCK) @skipUnless(has_einops, "Requires einops") + @SkipIfBeforePyTorchVersion((2, 0)) def test_shape(self, input_param, input_shape, expected_shape): net = SABlock(**input_param) with eval_mode(net): @@ -53,6 +69,60 @@ def test_ill_arg(self): with self.assertRaises(ValueError): SABlock(hidden_size=620, num_heads=8, dropout_rate=0.4) + @SkipIfBeforePyTorchVersion((2, 0)) + def test_rel_pos_embedding_with_flash_attention(self): + with self.assertRaises(ValueError): + SABlock( + hidden_size=128, + num_heads=3, + dropout_rate=0.1, + use_flash_attention=True, + save_attn=False, + rel_pos_embedding=RelPosEmbedding.DECOMPOSED, + ) + + @SkipIfBeforePyTorchVersion((1, 13)) + def test_save_attn_with_flash_attention(self): + with self.assertRaises(ValueError): + SABlock(hidden_size=128, num_heads=3, dropout_rate=0.1, use_flash_attention=True, save_attn=True) + + def test_attention_dim_not_multiple_of_heads(self): + with self.assertRaises(ValueError): + SABlock(hidden_size=128, num_heads=3, dropout_rate=0.1) + + @skipUnless(has_einops, "Requires einops") + def test_inner_dim_different(self): + SABlock(hidden_size=128, num_heads=4, dropout_rate=0.1, dim_head=30) + + def test_causal_no_sequence_length(self): + with self.assertRaises(ValueError): + SABlock(hidden_size=128, num_heads=4, dropout_rate=0.1, causal=True) + + @skipUnless(has_einops, "Requires einops") + @SkipIfBeforePyTorchVersion((2, 0)) + def test_causal_flash_attention(self): + block = SABlock( + hidden_size=128, + num_heads=1, + dropout_rate=0.1, + causal=True, + sequence_length=16, + save_attn=False, + use_flash_attention=True, + ) + input_shape = (1, 16, 128) + # Check it runs correctly + block(torch.randn(input_shape)) + + 
@skipUnless(has_einops, "Requires einops") + def test_causal(self): + block = SABlock(hidden_size=128, num_heads=1, dropout_rate=0.1, causal=True, sequence_length=16, save_attn=True) + input_shape = (1, 16, 128) + block(torch.randn(input_shape)) + # check upper triangular part of the attention matrix is zero + assert torch.triu(block.att_mat, diagonal=1).sum() == 0 + + @skipUnless(has_einops, "Requires einops") def test_access_attn_matrix(self): # input format hidden_size = 128 @@ -74,6 +144,73 @@ def test_access_attn_matrix(self): matrix_acess_blk(torch.randn(input_shape)) assert matrix_acess_blk.att_mat.shape == (input_shape[0], input_shape[0], input_shape[1], input_shape[1]) + def test_number_of_parameters(self): + + def count_sablock_params(*args, **kwargs): + """Count the number of parameters in a SABlock.""" + sablock = SABlock(*args, **kwargs) + return sum([x.numel() for x in sablock.parameters() if x.requires_grad]) + + hidden_size = 128 + num_heads = 8 + default_dim_head = hidden_size // num_heads + + # Default dim_head is hidden_size // num_heads + nparams_default = count_sablock_params(hidden_size=hidden_size, num_heads=num_heads) + nparams_like_default = count_sablock_params( + hidden_size=hidden_size, num_heads=num_heads, dim_head=default_dim_head + ) + self.assertEqual(nparams_default, nparams_like_default) + + # Increasing dim_head should increase the number of parameters + nparams_custom_large = count_sablock_params( + hidden_size=hidden_size, num_heads=num_heads, dim_head=default_dim_head * 2 + ) + self.assertGreater(nparams_custom_large, nparams_default) + + # Decreasing dim_head should decrease the number of parameters + nparams_custom_small = count_sablock_params( + hidden_size=hidden_size, num_heads=num_heads, dim_head=default_dim_head // 2 + ) + self.assertGreater(nparams_default, nparams_custom_small) + + # Increasing the number of heads with the default behaviour should not change the number of params. 
+ nparams_default_more_heads = count_sablock_params(hidden_size=hidden_size, num_heads=num_heads * 2) + self.assertEqual(nparams_default, nparams_default_more_heads) + + @parameterized.expand([[True, False], [True, True], [False, True], [False, False]]) + @skipUnless(has_einops, "Requires einops") + @SkipIfBeforePyTorchVersion((2, 0)) + def test_script(self, include_fc, use_combined_linear): + input_param = { + "hidden_size": 360, + "num_heads": 4, + "dropout_rate": 0.0, + "rel_pos_embedding": None, + "input_size": (16, 32), + "include_fc": include_fc, + "use_combined_linear": use_combined_linear, + } + net = SABlock(**input_param) + input_shape = (2, 512, 360) + test_data = torch.randn(input_shape) + test_script_save(net, test_data) + + @skipUnless(has_einops, "Requires einops") + @SkipIfBeforePyTorchVersion((2, 0)) + def test_flash_attention(self): + for causal in [True, False]: + input_param = {"hidden_size": 360, "num_heads": 4, "input_size": (16, 32), "causal": causal} + device = "cuda:0" if torch.cuda.is_available() else "cpu" + block_w_flash_attention = SABlock(**input_param, use_flash_attention=True).to(device) + block_wo_flash_attention = SABlock(**input_param, use_flash_attention=False).to(device) + block_wo_flash_attention.load_state_dict(block_w_flash_attention.state_dict()) + test_data = torch.randn(2, 512, 360).to(device) + + out_1 = block_w_flash_attention(test_data) + out_2 = block_wo_flash_attention(test_data) + assert_allclose(out_1, out_2, atol=1e-4) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_set_visible_devices.py b/tests/test_set_visible_devices.py index 7860656b3d..b4f44957a2 100644 --- a/tests/test_set_visible_devices.py +++ b/tests/test_set_visible_devices.py @@ -14,7 +14,7 @@ import os import unittest -from tests.utils import skip_if_no_cuda +from tests.utils import SkipIfAtLeastPyTorchVersion, skip_if_no_cuda class TestVisibleDevices(unittest.TestCase): @@ -25,6 +25,7 @@ def 
run_process_and_get_exit_code(code_to_execute): return int(bin(value).replace("0b", "").rjust(16, "0")[:8], 2) @skip_if_no_cuda + @SkipIfAtLeastPyTorchVersion((2, 2, 1)) def test_visible_devices(self): num_gpus_before = self.run_process_and_get_exit_code( 'python -c "import os; import torch; ' diff --git a/tests/test_sobel_gradient.py b/tests/test_sobel_gradient.py index 3d995a60c9..a0d7cf5a8b 100644 --- a/tests/test_sobel_gradient.py +++ b/tests/test_sobel_gradient.py @@ -164,8 +164,8 @@ def test_sobel_gradients(self, image, arguments, expected_grad): ) def test_sobel_kernels(self, arguments, expected_kernels): sobel = SobelGradients(**arguments) - self.assertTrue(sobel.kernel_diff.dtype == expected_kernels[0].dtype) - self.assertTrue(sobel.kernel_smooth.dtype == expected_kernels[0].dtype) + self.assertEqual(sobel.kernel_diff.dtype, expected_kernels[0].dtype) + self.assertEqual(sobel.kernel_smooth.dtype, expected_kernels[0].dtype) assert_allclose(sobel.kernel_diff, expected_kernels[0]) assert_allclose(sobel.kernel_smooth, expected_kernels[1]) diff --git a/tests/test_sobel_gradientd.py b/tests/test_sobel_gradientd.py index 7499a0410b..03524823a5 100644 --- a/tests/test_sobel_gradientd.py +++ b/tests/test_sobel_gradientd.py @@ -187,8 +187,8 @@ def test_sobel_gradients(self, image_dict, arguments, expected_grad): ) def test_sobel_kernels(self, arguments, expected_kernels): sobel = SobelGradientsd(**arguments) - self.assertTrue(sobel.kernel_diff.dtype == expected_kernels[0].dtype) - self.assertTrue(sobel.kernel_smooth.dtype == expected_kernels[0].dtype) + self.assertEqual(sobel.kernel_diff.dtype, expected_kernels[0].dtype) + self.assertEqual(sobel.kernel_smooth.dtype, expected_kernels[0].dtype) assert_allclose(sobel.kernel_diff, expected_kernels[0]) assert_allclose(sobel.kernel_smooth, expected_kernels[1]) diff --git a/tests/test_soft_clip.py b/tests/test_soft_clip.py new file mode 100644 index 0000000000..de5122e982 --- /dev/null +++ b/tests/test_soft_clip.py @@ -0,0 
+1,125 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import numpy as np +import torch +from parameterized import parameterized + +from monai.transforms.utils import soft_clip + +TEST_CASES = [ + [ + {"minv": 2, "maxv": 8, "sharpness_factor": 10}, + { + "input": torch.arange(10).float(), + "clipped": torch.tensor([2.0000, 2.0000, 2.0693, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 7.9307, 8.0000]), + }, + ], + [ + {"minv": 2, "maxv": None, "sharpness_factor": 10}, + { + "input": torch.arange(10).float(), + "clipped": torch.tensor([2.0000, 2.0000, 2.0693, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000]), + }, + ], + [ + {"minv": None, "maxv": 7, "sharpness_factor": 10}, + { + "input": torch.arange(10).float(), + "clipped": torch.tensor([0.0000, 1.0000, 2.0000, 3.0000, 4.0000, 5.0000, 6.0000, 6.9307, 7.0000, 7.0000]), + }, + ], + [ + {"minv": 2, "maxv": 8, "sharpness_factor": 1.0}, + { + "input": torch.arange(10).float(), + "clipped": torch.tensor([2.1266, 2.3124, 2.6907, 3.3065, 4.1088, 5.0000, 5.8912, 6.6935, 7.3093, 7.6877]), + }, + ], + [ + {"minv": 2, "maxv": 8, "sharpness_factor": 3.0}, + { + "input": torch.arange(10).float(), + "clipped": torch.tensor([2.0008, 2.0162, 2.2310, 3.0162, 4.0008, 5.0000, 5.9992, 6.9838, 7.7690, 7.9838]), + }, + ], + [ + {"minv": 2, "maxv": 8, "sharpness_factor": 5.0}, + { + "input": torch.arange(10).float(), + "clipped": torch.tensor([2.0000, 
2.0013, 2.1386, 3.0013, 4.0000, 5.0000, 6.0000, 6.9987, 7.8614, 7.9987]), + }, + ], + [ + {"minv": 2, "maxv": 8, "sharpness_factor": 10}, + { + "input": np.arange(10).astype(np.float32), + "clipped": np.array([2.0000, 2.0000, 2.0693, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 7.9307, 8.0000]), + }, + ], + [ + {"minv": 2, "maxv": None, "sharpness_factor": 10}, + { + "input": np.arange(10).astype(float), + "clipped": np.array([2.0000, 2.0000, 2.0693, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000]), + }, + ], + [ + {"minv": None, "maxv": 7, "sharpness_factor": 10}, + { + "input": np.arange(10).astype(float), + "clipped": np.array([0.0000, 1.0000, 2.0000, 3.0000, 4.0000, 5.0000, 6.0000, 6.9307, 7.0000, 7.0000]), + }, + ], + [ + {"minv": 2, "maxv": 8, "sharpness_factor": 1.0}, + { + "input": np.arange(10).astype(float), + "clipped": np.array([2.1266, 2.3124, 2.6907, 3.3065, 4.1088, 5.0000, 5.8912, 6.6935, 7.3093, 7.6877]), + }, + ], + [ + {"minv": 2, "maxv": 8, "sharpness_factor": 3.0}, + { + "input": np.arange(10).astype(float), + "clipped": np.array([2.0008, 2.0162, 2.2310, 3.0162, 4.0008, 5.0000, 5.9992, 6.9838, 7.7690, 7.9838]), + }, + ], + [ + {"minv": 2, "maxv": 8, "sharpness_factor": 5.0}, + { + "input": np.arange(10).astype(float), + "clipped": np.array([2.0000, 2.0013, 2.1386, 3.0013, 4.0000, 5.0000, 6.0000, 6.9987, 7.8614, 7.9987]), + }, + ], +] + + +class TestSoftClip(unittest.TestCase): + + @parameterized.expand(TEST_CASES) + def test_result(self, input_param, input_data): + outputs = soft_clip(input_data["input"], **input_param) + expected_val = input_data["clipped"] + if isinstance(outputs, torch.Tensor): + np.testing.assert_allclose( + outputs.detach().cpu().numpy(), expected_val.detach().cpu().numpy(), atol=1e-4, rtol=1e-4 + ) + else: + np.testing.assert_allclose(outputs, expected_val, atol=1e-4, rtol=1e-4) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_spade_autoencoderkl.py b/tests/test_spade_autoencoderkl.py new file 
mode 100644 index 0000000000..9353ceedc2 --- /dev/null +++ b/tests/test_spade_autoencoderkl.py @@ -0,0 +1,295 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets import SPADEAutoencoderKL +from monai.utils import optional_import + +einops, has_einops = optional_import("einops") + +CASES_NO_ATTENTION = [ + [ + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + }, + (1, 1, 16, 16), + (1, 3, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 3, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + "with_encoder_nonlocal_attn": False, + "with_decoder_nonlocal_attn": False, + }, + (1, 1, 16, 16, 16), + (1, 3, 16, 16, 16), + (1, 1, 16, 16, 16), + (1, 4, 4, 4, 4), + ], +] + +CASES_ATTENTION = [ + [ + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + 
"attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + }, + (1, 1, 16, 16), + (1, 3, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": (1, 1, 2), + "norm_num_groups": 4, + }, + (1, 1, 16, 16), + (1, 3, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + }, + (1, 1, 16, 16), + (1, 3, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, True), + "num_res_blocks": 1, + "norm_num_groups": 4, + }, + (1, 1, 16, 16), + (1, 3, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, False), + "num_res_blocks": 1, + "norm_num_groups": 4, + "with_encoder_nonlocal_attn": False, + }, + (1, 1, 16, 16), + (1, 3, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], + [ + { + "spatial_dims": 3, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, True), + "num_res_blocks": 1, + "norm_num_groups": 4, + }, + (1, 1, 16, 16, 16), + (1, 3, 16, 16, 16), + (1, 1, 16, 16, 16), + (1, 4, 4, 4, 4), + ], + [ + { + "spatial_dims": 2, + "label_nc": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4, 4), + "latent_channels": 4, + "attention_levels": (False, False, True), + "num_res_blocks": 1, + "norm_num_groups": 4, + 
"spade_intermediate_channels": 32, + }, + (1, 1, 16, 16), + (1, 3, 16, 16), + (1, 1, 16, 16), + (1, 4, 4, 4), + ], +] + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +if has_einops: + CASES = CASES_ATTENTION + CASES_NO_ATTENTION +else: + CASES = CASES_NO_ATTENTION + + +class TestSPADEAutoEncoderKL(unittest.TestCase): + @parameterized.expand(CASES) + def test_shape(self, input_param, input_shape, input_seg, expected_shape, expected_latent_shape): + net = SPADEAutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.forward(torch.randn(input_shape).to(device), torch.randn(input_seg).to(device)) + self.assertEqual(result[0].shape, expected_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + + @skipUnless(has_einops, "Requires einops") + def test_model_channels_not_multiple_of_norm_num_group(self): + with self.assertRaises(ValueError): + SPADEAutoencoderKL( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + channels=(24, 24, 24), + attention_levels=(False, False, False), + latent_channels=8, + num_res_blocks=1, + norm_num_groups=16, + ) + + @skipUnless(has_einops, "Requires einops") + def test_model_channels_not_same_size_of_attention_levels(self): + with self.assertRaises(ValueError): + SPADEAutoencoderKL( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + channels=(24, 24, 24), + attention_levels=(False, False), + latent_channels=8, + num_res_blocks=1, + norm_num_groups=16, + ) + + @skipUnless(has_einops, "Requires einops") + def test_model_channels_not_same_size_of_num_res_blocks(self): + with self.assertRaises(ValueError): + SPADEAutoencoderKL( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + channels=(24, 24, 24), + attention_levels=(False, False, False), + latent_channels=8, + num_res_blocks=(8, 8), + norm_num_groups=16, + ) + + def test_shape_encode(self): + input_param, input_shape, _, _, expected_latent_shape = CASES[0] + net = 
SPADEAutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.encode(torch.randn(input_shape).to(device)) + self.assertEqual(result[0].shape, expected_latent_shape) + self.assertEqual(result[1].shape, expected_latent_shape) + + def test_shape_sampling(self): + input_param, _, _, _, expected_latent_shape = CASES[0] + net = SPADEAutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.sampling( + torch.randn(expected_latent_shape).to(device), torch.randn(expected_latent_shape).to(device) + ) + self.assertEqual(result.shape, expected_latent_shape) + + def test_shape_decode(self): + input_param, _, input_seg_shape, expected_input_shape, latent_shape = CASES[0] + net = SPADEAutoencoderKL(**input_param).to(device) + with eval_mode(net): + result = net.decode(torch.randn(latent_shape).to(device), torch.randn(input_seg_shape).to(device)) + self.assertEqual(result.shape, expected_input_shape) + + @skipUnless(has_einops, "Requires einops") + def test_wrong_shape_decode(self): + net = SPADEAutoencoderKL( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + channels=(4, 4, 4), + latent_channels=4, + attention_levels=(False, False, False), + num_res_blocks=1, + norm_num_groups=4, + ) + with self.assertRaises(RuntimeError): + _ = net.decode(torch.randn((1, 1, 16, 16)).to(device), torch.randn((1, 6, 16, 16)).to(device)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_spade_diffusion_model_unet.py b/tests/test_spade_diffusion_model_unet.py new file mode 100644 index 0000000000..481705f56f --- /dev/null +++ b/tests/test_spade_diffusion_model_unet.py @@ -0,0 +1,574 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets import SPADEDiffusionModelUNet +from monai.utils import optional_import + +einops, has_einops = optional_import("einops") +UNCOND_CASES_2D = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": (1, 1, 2), + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "resblock_updown": True, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "resblock_updown": True, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + 
"out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, True, True), + "num_head_channels": (0, 2, 4), + "norm_num_groups": 8, + "label_nc": 3, + } + ], +] + +UNCOND_CASES_3D = [ + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "label_nc": 3, + "spade_intermediate_channels": 256, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, False), + "norm_num_groups": 8, + "resblock_updown": True, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 8, + "norm_num_groups": 8, + "resblock_updown": True, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + 
"channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": (0, 0, 4), + "norm_num_groups": 8, + "label_nc": 3, + } + ], +] + +COND_CASES_2D = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "resblock_updown": True, + "label_nc": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "upcast_attention": True, + "label_nc": 3, + } + ], +] + + +class TestSPADEDiffusionModelUNet2D(unittest.TestCase): + @parameterized.expand(UNCOND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def test_shape_unconditioned_models(self, input_param): + net = SPADEDiffusionModelUNet(**input_param) + with eval_mode(net): + result = net.forward( + torch.rand((1, 1, 16, 16)), + torch.randint(0, 1000, (1,)).long(), + torch.rand((1, input_param["label_nc"], 16, 16)), + ) + self.assertEqual(result.shape, (1, 1, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_timestep_with_wrong_shape(self): + net = SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with 
self.assertRaises(ValueError): + with eval_mode(net): + net.forward( + torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1, 1)).long(), torch.rand((1, 3, 16, 16)) + ) + + @skipUnless(has_einops, "Requires einops") + def test_label_with_wrong_shape(self): + net = SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with self.assertRaises(RuntimeError): + with eval_mode(net): + net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1,)).long(), torch.rand((1, 6, 16, 16))) + + @skipUnless(has_einops, "Requires einops") + def test_shape_with_different_in_channel_out_channel(self): + in_channels = 6 + out_channels = 3 + net = SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=in_channels, + out_channels=out_channels, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with eval_mode(net): + result = net.forward( + torch.rand((1, in_channels, 16, 16)), torch.randint(0, 1000, (1,)).long(), torch.rand((1, 3, 16, 16)) + ) + self.assertEqual(result.shape, (1, out_channels, 16, 16)) + + def test_model_channels_not_multiple_of_norm_num_group(self): + with self.assertRaises(ValueError): + SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 12), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + + def test_attention_levels_with_different_length_num_head_channels(self): + with self.assertRaises(ValueError): + SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, False), + num_head_channels=(0, 2), + norm_num_groups=8, + ) + + def test_num_res_blocks_with_different_length_channels(self): + with self.assertRaises(ValueError): + 
SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=(1, 1), + channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models(self): + net = SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=3, + norm_num_groups=8, + num_head_channels=8, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + seg=torch.rand((1, 3, 16, 32)), + context=torch.rand((1, 1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 32)) + + @skipUnless(has_einops, "Requires einops") + def test_with_conditioning_cross_attention_dim_none(self): + with self.assertRaises(ValueError): + SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=None, + norm_num_groups=8, + ) + + @skipUnless(has_einops, "Requires einops") + def test_context_with_conditioning_none(self): + net = SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + with_conditioning=False, + transformer_num_layers=1, + norm_num_groups=8, + ) + + with self.assertRaises(ValueError): + with eval_mode(net): + net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + seg=torch.rand((1, 3, 16, 32)), + context=torch.rand((1, 1, 3)), + ) + + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models_class_conditioning(self): + net = 
SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + seg=torch.rand((1, 3, 16, 32)), + class_labels=torch.randint(0, 2, (1,)).long(), + ) + self.assertEqual(result.shape, (1, 1, 16, 32)) + + @skipUnless(has_einops, "Requires einops") + def test_conditioned_models_no_class_labels(self): + net = SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + + with self.assertRaises(ValueError): + net.forward( + x=torch.rand((1, 1, 16, 32)), + timesteps=torch.randint(0, 1000, (1,)).long(), + seg=torch.rand((1, 3, 16, 32)), + ) + + def test_model_channels_not_same_size_of_attention_levels(self): + with self.assertRaises(ValueError): + SPADEDiffusionModelUNet( + spatial_dims=2, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False), + norm_num_groups=8, + num_head_channels=8, + num_class_embeds=2, + ) + + @parameterized.expand(COND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def test_conditioned_2d_models_shape(self, input_param): + net = SPADEDiffusionModelUNet(**input_param) + with eval_mode(net): + result = net.forward( + torch.rand((1, 1, 16, 16)), + torch.randint(0, 1000, (1,)).long(), + torch.rand((1, input_param["label_nc"], 16, 16)), + torch.rand((1, 1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 16)) + + +class TestDiffusionModelUNet3D(unittest.TestCase): + @parameterized.expand(UNCOND_CASES_3D) + @skipUnless(has_einops, "Requires einops") + def 
test_shape_unconditioned_models(self, input_param): + net = SPADEDiffusionModelUNet(**input_param) + with eval_mode(net): + result = net.forward( + torch.rand((1, 1, 16, 16, 16)), + torch.randint(0, 1000, (1,)).long(), + torch.rand((1, input_param["label_nc"], 16, 16, 16)), + ) + self.assertEqual(result.shape, (1, 1, 16, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_shape_with_different_in_channel_out_channel(self): + in_channels = 6 + out_channels = 3 + net = SPADEDiffusionModelUNet( + spatial_dims=3, + label_nc=3, + in_channels=in_channels, + out_channels=out_channels, + num_res_blocks=1, + channels=(8, 8, 8), + attention_levels=(False, False, True), + norm_num_groups=4, + ) + with eval_mode(net): + result = net.forward( + torch.rand((1, in_channels, 16, 16, 16)), + torch.randint(0, 1000, (1,)).long(), + torch.rand((1, 3, 16, 16, 16)), + ) + self.assertEqual(result.shape, (1, out_channels, 16, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_shape_conditioned_models(self): + net = SPADEDiffusionModelUNet( + spatial_dims=3, + label_nc=3, + in_channels=1, + out_channels=1, + num_res_blocks=1, + channels=(16, 16, 16), + attention_levels=(False, False, True), + norm_num_groups=16, + with_conditioning=True, + transformer_num_layers=1, + cross_attention_dim=3, + ) + with eval_mode(net): + result = net.forward( + x=torch.rand((1, 1, 16, 16, 16)), + timesteps=torch.randint(0, 1000, (1,)).long(), + seg=torch.rand((1, 3, 16, 16, 16)), + context=torch.rand((1, 1, 3)), + ) + self.assertEqual(result.shape, (1, 1, 16, 16, 16)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_spade_vaegan.py b/tests/test_spade_vaegan.py new file mode 100644 index 0000000000..3fdb9b74cb --- /dev/null +++ b/tests/test_spade_vaegan.py @@ -0,0 +1,140 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import numpy as np +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets import SPADENet + +CASE_2D = [ + [[2, 1, 1, 3, [64, 64], [16, 32, 64, 128], 16, True]], + [[2, 1, 1, 3, [64, 64], [16, 32, 64, 128], None, False]], +] +CASE_3D = [ + [[3, 1, 1, 3, [64, 64, 64], [16, 32, 64, 128], 16, True]], + [[3, 1, 1, 3, [64, 64, 64], [16, 32, 64, 128], None, False]], +] + + +def create_semantic_data(shape: list, semantic_regions: int): + """ + To create semantic and image mock inputs for the network. 
+ Args: + shape: input shape + semantic_regions: number of semantic regions, background included + Returns: one-hot label [1, semantic_regions, *shape] and image [1, 1, *shape] tensors. + """ + out_label = torch.zeros(shape) + out_image = torch.zeros(shape) + torch.randn(shape) * 0.01 + for i in range(1, semantic_regions): + shape_square = [i // np.random.choice(list(range(2, i // 2))) for i in shape] + start_point = [np.random.choice(list(range(shape[ind] - shape_square[ind]))) for ind, i in enumerate(shape)] + if len(shape) == 2: + out_label[ + start_point[0] : (start_point[0] + shape_square[0]), start_point[1] : (start_point[1] + shape_square[1]) + ] = i + base_intensity = torch.ones(shape_square) * np.random.randn() + out_image[ + start_point[0] : (start_point[0] + shape_square[0]), start_point[1] : (start_point[1] + shape_square[1]) + ] = (base_intensity + torch.randn(shape_square) * 0.1) + elif len(shape) == 3: + out_label[ + start_point[0] : (start_point[0] + shape_square[0]), + start_point[1] : (start_point[1] + shape_square[1]), + start_point[2] : (start_point[2] + shape_square[2]), + ] = i + base_intensity = torch.ones(shape_square) * np.random.randn() + out_image[ + start_point[0] : (start_point[0] + shape_square[0]), + start_point[1] : (start_point[1] + shape_square[1]), + start_point[2] : (start_point[2] + shape_square[2]), + ] = (base_intensity + torch.randn(shape_square) * 0.1) + else: + raise ValueError("Supports only 2D and 3D tensors") + + # One hot encode label + out_label_ = torch.zeros([semantic_regions] + list(out_label.shape)) + for ch in range(semantic_regions): + out_label_[ch, ...] = out_label == ch + + return out_label_.unsqueeze(0), out_image.unsqueeze(0).unsqueeze(0) + + +class TestSpadeNet(unittest.TestCase): + @parameterized.expand(CASE_2D) + def test_forward_2d(self, input_param): + """ + Check that forward method is called correctly and output shape matches. 
+ """ + net = SPADENet(*input_param) + in_label, in_image = create_semantic_data(input_param[4], input_param[3]) + with eval_mode(net): + if not net.is_vae: + out = net(in_label, in_image) + out = out[0] + else: + out, z_mu, z_logvar = net(in_label, in_image) + self.assertTrue(torch.all(torch.isfinite(z_mu))) + self.assertTrue(torch.all(torch.isfinite(z_logvar))) + + self.assertTrue(torch.all(torch.isfinite(out))) + self.assertEqual(list(out.shape), [1, 1, 64, 64]) + + @parameterized.expand(CASE_2D) + def test_encoder_decoder(self, input_param): + """ + Check that encode and decode methods are called correctly and output shapes match. + """ + net = SPADENet(*input_param) + in_label, in_image = create_semantic_data(input_param[4], input_param[3]) + with eval_mode(net): + out_z = net.encode(in_image) + if net.is_vae: + self.assertEqual(list(out_z.shape), [1, 16]) + else: + self.assertIsNone(out_z) + out_i = net.decode(in_label, out_z) + self.assertEqual(list(out_i.shape), [1, 1, 64, 64]) + + @parameterized.expand(CASE_3D) + def test_forward_3d(self, input_param): + """ + Check that forward method is called correctly and output shape matches. 
+ """ + net = SPADENet(*input_param) + in_label, in_image = create_semantic_data(input_param[4], input_param[3]) + with eval_mode(net): + if net.is_vae: + out, z_mu, z_logvar = net(in_label, in_image) + self.assertTrue(torch.all(torch.isfinite(z_mu))) + self.assertTrue(torch.all(torch.isfinite(z_logvar))) + else: + out = net(in_label, in_image) + out = out[0] + self.assertTrue(torch.all(torch.isfinite(out))) + self.assertEqual(list(out.shape), [1, 1, 64, 64, 64]) + + def test_shape_wrong(self): + """ + We input an input shape that isn't divisible by 2**(n downstream steps) + """ + with self.assertRaises(ValueError): + _ = SPADENet(1, 1, 8, [16, 16], [16, 32, 64, 128], 16, True) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_spatial_resampled.py b/tests/test_spatial_resampled.py index 541015cc34..d5c86258d7 100644 --- a/tests/test_spatial_resampled.py +++ b/tests/test_spatial_resampled.py @@ -11,6 +11,7 @@ from __future__ import annotations +import platform import unittest import numpy as np @@ -23,6 +24,12 @@ from tests.lazy_transforms_utils import test_resampler_lazy from tests.utils import TEST_DEVICES, assert_allclose +ON_AARCH64 = platform.machine() == "aarch64" +if ON_AARCH64: + rtol, atol = 1e-1, 1e-2 +else: + rtol, atol = 1e-3, 1e-4 + TESTS = [] destinations_3d = [ @@ -104,7 +111,7 @@ def test_flips_inverse(self, img, device, dst_affine, kwargs, expected_output): # check lazy lazy_xform = SpatialResampled(**init_param) - test_resampler_lazy(lazy_xform, output_data, init_param, call_param, output_key="img") + test_resampler_lazy(lazy_xform, output_data, init_param, call_param, output_key="img", rtol=rtol, atol=atol) # check inverse inverted = xform.inverse(output_data)["img"] diff --git a/tests/test_spatialattention.py b/tests/test_spatialattention.py new file mode 100644 index 0000000000..70b78263c5 --- /dev/null +++ b/tests/test_spatialattention.py @@ -0,0 +1,55 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.blocks.spatialattention import SpatialAttentionBlock +from monai.utils import optional_import + +einops, has_einops = optional_import("einops") + +TEST_CASES = [ + [ + {"spatial_dims": 2, "num_channels": 128, "num_head_channels": 32, "norm_num_groups": 32, "norm_eps": 1e-6}, + (1, 128, 32, 32), + (1, 128, 32, 32), + ], + [ + {"spatial_dims": 3, "num_channels": 16, "num_head_channels": 8, "norm_num_groups": 8, "norm_eps": 1e-6}, + (1, 16, 8, 8, 8), + (1, 16, 8, 8, 8), + ], +] + + +class TestBlock(unittest.TestCase): + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_shape(self, input_param, input_shape, expected_shape): + net = SpatialAttentionBlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_attention_dim_not_multiple_of_heads(self): + with self.assertRaises(ValueError): + SpatialAttentionBlock(spatial_dims=2, num_channels=128, num_head_channels=33) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_subpixel_upsample.py b/tests/test_subpixel_upsample.py index 5abbe57e11..fe9fb1c328 100644 --- a/tests/test_subpixel_upsample.py +++ b/tests/test_subpixel_upsample.py @@ -55,9 +55,9 @@ (2, 1, 32, 16, 
8), ] -TEST_CASE_SUBPIXEL.append(TEST_CASE_SUBPIXEL_2D_EXTRA) -TEST_CASE_SUBPIXEL.append(TEST_CASE_SUBPIXEL_3D_EXTRA) -TEST_CASE_SUBPIXEL.append(TEST_CASE_SUBPIXEL_CONV_BLOCK_EXTRA) +TEST_CASE_SUBPIXEL.append(TEST_CASE_SUBPIXEL_2D_EXTRA) # type: ignore +TEST_CASE_SUBPIXEL.append(TEST_CASE_SUBPIXEL_3D_EXTRA) # type: ignore +TEST_CASE_SUBPIXEL.append(TEST_CASE_SUBPIXEL_CONV_BLOCK_EXTRA) # type: ignore # add every test back with the pad/pool sequential component omitted for tests in list(TEST_CASE_SUBPIXEL): diff --git a/tests/test_sure_loss.py b/tests/test_sure_loss.py new file mode 100644 index 0000000000..903f9bd2ca --- /dev/null +++ b/tests/test_sure_loss.py @@ -0,0 +1,72 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +import torch + +from monai.losses import SURELoss + + +class TestSURELoss(unittest.TestCase): + + def test_real_value(self): + """Test SURELoss with real-valued input: when the input is real value, the loss should be 0.0.""" + sure_loss_real = SURELoss(perturb_noise=torch.zeros(2, 1, 128, 128), eps=0.1) + + def operator(x): + return x + + y_pseudo_gt = torch.randn(2, 1, 128, 128) + x = torch.randn(2, 1, 128, 128) + loss = sure_loss_real(operator, x, y_pseudo_gt, complex_input=False) + self.assertAlmostEqual(loss.item(), 0.0) + + def test_complex_value(self): + """Test SURELoss with complex-valued input: when the input is complex value, the loss should be 0.0.""" + + def operator(x): + return x + + sure_loss_complex = SURELoss(perturb_noise=torch.zeros(2, 2, 128, 128), eps=0.1) + y_pseudo_gt = torch.randn(2, 2, 128, 128) + x = torch.randn(2, 2, 128, 128) + loss = sure_loss_complex(operator, x, y_pseudo_gt, complex_input=True) + self.assertAlmostEqual(loss.item(), 0.0) + + def test_complex_general_input(self): + """Test SURELoss with complex-valued input: when the input is general complex value, the loss should be 0.0.""" + + def operator(x): + return x + + perturb_noise_real = torch.randn(2, 1, 128, 128) + perturb_noise_complex = torch.zeros(2, 2, 128, 128) + perturb_noise_complex[:, 0, :, :] = perturb_noise_real.squeeze() + y_pseudo_gt_real = torch.randn(2, 1, 128, 128) + y_pseudo_gt_complex = torch.zeros(2, 2, 128, 128) + y_pseudo_gt_complex[:, 0, :, :] = y_pseudo_gt_real.squeeze() + x_real = torch.randn(2, 1, 128, 128) + x_complex = torch.zeros(2, 2, 128, 128) + x_complex[:, 0, :, :] = x_real.squeeze() + + sure_loss_real = SURELoss(perturb_noise=perturb_noise_real, eps=0.1) + sure_loss_complex = SURELoss(perturb_noise=perturb_noise_complex, eps=0.1) + + loss_real = sure_loss_real(operator, x_real, y_pseudo_gt_real, complex_input=False) + loss_complex = sure_loss_complex(operator, x_complex, 
y_pseudo_gt_complex, complex_input=True) + self.assertAlmostEqual(loss_real.item(), loss_complex.abs().item(), places=6) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_synthetic.py b/tests/test_synthetic.py index 7db3c3e77a..4ab2144568 100644 --- a/tests/test_synthetic.py +++ b/tests/test_synthetic.py @@ -47,7 +47,7 @@ def test_create_test_image(self, dim, input_param, expected_img, expected_seg, e set_determinism(seed=0) if dim == 2: img, seg = create_test_image_2d(**input_param) - elif dim == 3: + else: # dim == 3 img, seg = create_test_image_3d(**input_param) self.assertEqual(img.shape, expected_shape) self.assertEqual(seg.max(), expected_max_cls) diff --git a/tests/test_tciadataset.py b/tests/test_tciadataset.py index d996922e20..5a16bb4816 100644 --- a/tests/test_tciadataset.py +++ b/tests/test_tciadataset.py @@ -108,7 +108,7 @@ def _test_dataset(dataset): )[0] shutil.rmtree(os.path.join(testing_dir, collection)) - try: + with self.assertRaisesRegex(RuntimeError, "^Cannot find dataset directory"): TciaDataset( root_dir=testing_dir, collection=collection, @@ -117,8 +117,6 @@ def _test_dataset(dataset): download=False, val_frac=val_frac, ) - except RuntimeError as e: - self.assertTrue(str(e).startswith("Cannot find dataset directory")) if __name__ == "__main__": diff --git a/tests/test_threadcontainer.py b/tests/test_threadcontainer.py index 9551dec703..568461748b 100644 --- a/tests/test_threadcontainer.py +++ b/tests/test_threadcontainer.py @@ -62,7 +62,7 @@ def test_container(self): self.assertTrue(con.is_alive) self.assertIsNotNone(con.status()) - self.assertTrue(len(con.status_dict) > 0) + self.assertGreater(len(con.status_dict), 0) con.join() diff --git a/tests/test_to_cupy.py b/tests/test_to_cupy.py index 5a1754e7c5..38400f0d3f 100644 --- a/tests/test_to_cupy.py +++ b/tests/test_to_cupy.py @@ -62,8 +62,8 @@ def test_numpy_input_dtype(self): test_data = np.rot90(test_data) self.assertFalse(test_data.flags["C_CONTIGUOUS"]) result = 
ToCupy(np.uint8)(test_data) - self.assertTrue(result.dtype == cp.uint8) - self.assertTrue(isinstance(result, cp.ndarray)) + self.assertEqual(result.dtype, cp.uint8) + self.assertIsInstance(result, cp.ndarray) self.assertTrue(result.flags["C_CONTIGUOUS"]) cp.testing.assert_allclose(result, test_data) @@ -72,8 +72,8 @@ def test_tensor_input(self): test_data = test_data.rot90() self.assertFalse(test_data.is_contiguous()) result = ToCupy()(test_data) - self.assertTrue(result.dtype == cp.float32) - self.assertTrue(isinstance(result, cp.ndarray)) + self.assertEqual(result.dtype, cp.float32) + self.assertIsInstance(result, cp.ndarray) self.assertTrue(result.flags["C_CONTIGUOUS"]) cp.testing.assert_allclose(result, test_data) @@ -83,8 +83,8 @@ def test_tensor_cuda_input(self): test_data = test_data.rot90() self.assertFalse(test_data.is_contiguous()) result = ToCupy()(test_data) - self.assertTrue(result.dtype == cp.float32) - self.assertTrue(isinstance(result, cp.ndarray)) + self.assertEqual(result.dtype, cp.float32) + self.assertIsInstance(result, cp.ndarray) self.assertTrue(result.flags["C_CONTIGUOUS"]) cp.testing.assert_allclose(result, test_data) @@ -95,8 +95,8 @@ def test_tensor_cuda_input_dtype(self): self.assertFalse(test_data.is_contiguous()) result = ToCupy(dtype="float32")(test_data) - self.assertTrue(result.dtype == cp.float32) - self.assertTrue(isinstance(result, cp.ndarray)) + self.assertEqual(result.dtype, cp.float32) + self.assertIsInstance(result, cp.ndarray) self.assertTrue(result.flags["C_CONTIGUOUS"]) cp.testing.assert_allclose(result, test_data) diff --git a/tests/test_to_numpy.py b/tests/test_to_numpy.py index 8f7cf34865..f4e5f80a29 100644 --- a/tests/test_to_numpy.py +++ b/tests/test_to_numpy.py @@ -32,7 +32,7 @@ def test_cupy_input(self): test_data = cp.rot90(test_data) self.assertFalse(test_data.flags["C_CONTIGUOUS"]) result = ToNumpy()(test_data) - self.assertTrue(isinstance(result, np.ndarray)) + self.assertIsInstance(result, np.ndarray) 
self.assertTrue(result.flags["C_CONTIGUOUS"]) assert_allclose(result, test_data.get(), type_test=False) @@ -41,8 +41,8 @@ def test_numpy_input(self): test_data = np.rot90(test_data) self.assertFalse(test_data.flags["C_CONTIGUOUS"]) result = ToNumpy(dtype="float32")(test_data) - self.assertTrue(isinstance(result, np.ndarray)) - self.assertTrue(result.dtype == np.float32) + self.assertIsInstance(result, np.ndarray) + self.assertEqual(result.dtype, np.float32) self.assertTrue(result.flags["C_CONTIGUOUS"]) assert_allclose(result, test_data, type_test=False) @@ -51,7 +51,7 @@ def test_tensor_input(self): test_data = test_data.rot90() self.assertFalse(test_data.is_contiguous()) result = ToNumpy(dtype=torch.uint8)(test_data) - self.assertTrue(isinstance(result, np.ndarray)) + self.assertIsInstance(result, np.ndarray) self.assertTrue(result.flags["C_CONTIGUOUS"]) assert_allclose(result, test_data, type_test=False) @@ -61,7 +61,7 @@ def test_tensor_cuda_input(self): test_data = test_data.rot90() self.assertFalse(test_data.is_contiguous()) result = ToNumpy()(test_data) - self.assertTrue(isinstance(result, np.ndarray)) + self.assertIsInstance(result, np.ndarray) self.assertTrue(result.flags["C_CONTIGUOUS"]) assert_allclose(result, test_data, type_test=False) @@ -71,13 +71,13 @@ def test_list_tuple(self): assert_allclose(result, np.asarray(test_data), type_test=False) test_data = ((1, 2), (3, 4)) result = ToNumpy(wrap_sequence=False)(test_data) - self.assertTrue(type(result), tuple) + self.assertIsInstance(result, tuple) assert_allclose(result, ((np.asarray(1), np.asarray(2)), (np.asarray(3), np.asarray(4)))) def test_single_value(self): for test_data in [5, np.array(5), torch.tensor(5)]: result = ToNumpy(dtype=np.uint8)(test_data) - self.assertTrue(isinstance(result, np.ndarray)) + self.assertIsInstance(result, np.ndarray) assert_allclose(result, np.asarray(test_data), type_test=False) self.assertEqual(result.ndim, 0) diff --git a/tests/test_torchscript_utils.py 
b/tests/test_torchscript_utils.py index 6f8f231829..5a5fb47864 100644 --- a/tests/test_torchscript_utils.py +++ b/tests/test_torchscript_utils.py @@ -23,6 +23,7 @@ class TestModule(torch.nn.Module): + __test__ = False # indicate to pytest that this class is not intended for collection def forward(self, x): return x + 10 diff --git a/tests/test_torchvision_fc_model.py b/tests/test_torchvision_fc_model.py index 322cce1161..9cc19db62c 100644 --- a/tests/test_torchvision_fc_model.py +++ b/tests/test_torchvision_fc_model.py @@ -195,8 +195,8 @@ def test_get_module(self): mod = look_up_named_module("model.1.submodule.1.submodule.1.submodule.0.conv", net) self.assertTrue(str(mod).startswith("Conv2d")) self.assertIsInstance(set_named_module(net, "model", torch.nn.Identity()).model, torch.nn.Identity) - self.assertEqual(look_up_named_module("model.1.submodule.1.submodule.1.submodule.conv", net), None) - self.assertEqual(look_up_named_module("test attribute", net), None) + self.assertIsNone(look_up_named_module("model.1.submodule.1.submodule.1.submodule.conv", net)) + self.assertIsNone(look_up_named_module("test attribute", net)) if __name__ == "__main__": diff --git a/tests/test_traceable_transform.py b/tests/test_traceable_transform.py index dd139053e3..6a499b2dd9 100644 --- a/tests/test_traceable_transform.py +++ b/tests/test_traceable_transform.py @@ -33,12 +33,12 @@ def test_default(self): expected_key = "_transforms" a = _TraceTest() for x in a.transform_info_keys(): - self.assertTrue(x in a.get_transform_info()) + self.assertIn(x, a.get_transform_info()) self.assertEqual(a.trace_key(), expected_key) data = {"image": "test"} data = a(data) # adds to the stack - self.assertTrue(isinstance(data[expected_key], list)) + self.assertIsInstance(data[expected_key], list) self.assertEqual(data[expected_key][0]["class"], "_TraceTest") data = a(data) # adds to the stack diff --git a/tests/test_transformer.py b/tests/test_transformer.py new file mode 100644 index 
0000000000..b371809d47 --- /dev/null +++ b/tests/test_transformer.py @@ -0,0 +1,109 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os +import tempfile +import unittest +from unittest import skipUnless + +import numpy as np +import torch +from parameterized import parameterized + +from monai.apps import download_url +from monai.networks import eval_mode +from monai.networks.nets import DecoderOnlyTransformer +from monai.utils import optional_import +from tests.utils import skip_if_downloading_fails, testing_data_config + +_, has_einops = optional_import("einops") +TEST_CASES = [] +for dropout_rate in np.linspace(0, 1, 2): + for attention_layer_dim in [360, 480, 600, 768]: + for num_heads in [4, 6, 8, 12]: + TEST_CASES.append( + [ + { + "num_tokens": 10, + "max_seq_len": 16, + "attn_layers_dim": attention_layer_dim, + "attn_layers_depth": 2, + "attn_layers_heads": num_heads, + "embedding_dropout_rate": dropout_rate, + } + ] + ) + + +class TestDecoderOnlyTransformer(unittest.TestCase): + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_unconditioned_models(self, input_param): + net = DecoderOnlyTransformer(**input_param) + with eval_mode(net): + net.forward(torch.randint(0, 10, (1, 16))) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_conditioned_models(self, input_param): + net = 
DecoderOnlyTransformer(**input_param, with_cross_attention=True) + with eval_mode(net): + net.forward(torch.randint(0, 10, (1, 16)), context=torch.randn(1, 3, input_param["attn_layers_dim"])) + + def test_attention_dim_not_multiple_of_heads(self): + with self.assertRaises(ValueError): + DecoderOnlyTransformer( + num_tokens=10, max_seq_len=16, attn_layers_dim=8, attn_layers_depth=2, attn_layers_heads=3 + ) + + @skipUnless(has_einops, "Requires einops") + def test_dropout_rate_negative(self): + + with self.assertRaises(ValueError): + DecoderOnlyTransformer( + num_tokens=10, + max_seq_len=16, + attn_layers_dim=8, + attn_layers_depth=2, + attn_layers_heads=2, + embedding_dropout_rate=-1, + ) + + @skipUnless(has_einops, "Requires einops") + def test_compatibility_with_monai_generative(self): + # test loading weights from a model saved in MONAI Generative, version 0.2.3 + with skip_if_downloading_fails(): + net = DecoderOnlyTransformer( + num_tokens=10, + max_seq_len=16, + attn_layers_dim=8, + attn_layers_depth=2, + attn_layers_heads=2, + with_cross_attention=True, + embedding_dropout_rate=0, + ) + + tmpdir = tempfile.mkdtemp() + key = "decoder_only_transformer_monai_generative_weights" + url = testing_data_config("models", key, "url") + hash_type = testing_data_config("models", key, "hash_type") + hash_val = testing_data_config("models", key, "hash_val") + filename = "decoder_only_transformer_monai_generative_weights.pt" + weight_path = os.path.join(tmpdir, filename) + download_url(url=url, filepath=weight_path, hash_val=hash_val, hash_type=hash_type) + + net.load_old_state_dict(torch.load(weight_path), verbose=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_transformerblock.py b/tests/test_transformerblock.py index 5a8dbba83c..a850cc6f74 100644 --- a/tests/test_transformerblock.py +++ b/tests/test_transformerblock.py @@ -12,6 +12,7 @@ from __future__ import annotations import unittest +from unittest import skipUnless import numpy as np 
import torch @@ -19,28 +20,33 @@ from monai.networks import eval_mode from monai.networks.blocks.transformerblock import TransformerBlock +from monai.utils import optional_import +einops, has_einops = optional_import("einops") TEST_CASE_TRANSFORMERBLOCK = [] for dropout_rate in np.linspace(0, 1, 4): for hidden_size in [360, 480, 600, 768]: for num_heads in [4, 8, 12]: for mlp_dim in [1024, 3072]: - test_case = [ - { - "hidden_size": hidden_size, - "num_heads": num_heads, - "mlp_dim": mlp_dim, - "dropout_rate": dropout_rate, - }, - (2, 512, hidden_size), - (2, 512, hidden_size), - ] - TEST_CASE_TRANSFORMERBLOCK.append(test_case) + for cross_attention in [False, True]: + test_case = [ + { + "hidden_size": hidden_size, + "num_heads": num_heads, + "mlp_dim": mlp_dim, + "dropout_rate": dropout_rate, + "with_cross_attention": cross_attention, + }, + (2, 512, hidden_size), + (2, 512, hidden_size), + ] + TEST_CASE_TRANSFORMERBLOCK.append(test_case) class TestTransformerBlock(unittest.TestCase): @parameterized.expand(TEST_CASE_TRANSFORMERBLOCK) + @skipUnless(has_einops, "Requires einops") def test_shape(self, input_param, input_shape, expected_shape): net = TransformerBlock(**input_param) with eval_mode(net): @@ -54,6 +60,7 @@ def test_ill_arg(self): with self.assertRaises(ValueError): TransformerBlock(hidden_size=622, num_heads=8, mlp_dim=3072, dropout_rate=0.4) + @skipUnless(has_einops, "Requires einops") def test_access_attn_matrix(self): # input format hidden_size = 128 diff --git a/tests/test_tversky_loss.py b/tests/test_tversky_loss.py index efe1f2cdf3..0365503ea2 100644 --- a/tests/test_tversky_loss.py +++ b/tests/test_tversky_loss.py @@ -165,17 +165,12 @@ def test_ill_shape(self): with self.assertRaisesRegex(ValueError, ""): TverskyLoss(reduction=None)(chn_input, chn_target) - def test_input_warnings(self): + @parameterized.expand([(False, False, False), (False, True, False), (False, False, True)]) + def test_input_warnings(self, include_background, softmax, 
to_onehot_y): chn_input = torch.ones((1, 1, 3)) chn_target = torch.ones((1, 1, 3)) with self.assertWarns(Warning): - loss = TverskyLoss(include_background=False) - loss.forward(chn_input, chn_target) - with self.assertWarns(Warning): - loss = TverskyLoss(softmax=True) - loss.forward(chn_input, chn_target) - with self.assertWarns(Warning): - loss = TverskyLoss(to_onehot_y=True) + loss = TverskyLoss(include_background=include_background, softmax=softmax, to_onehot_y=to_onehot_y) loss.forward(chn_input, chn_target) def test_script(self): diff --git a/tests/test_ultrasound_confidence_map_transform.py b/tests/test_ultrasound_confidence_map_transform.py index f672961700..1c6b8f7635 100644 --- a/tests/test_ultrasound_confidence_map_transform.py +++ b/tests/test_ultrasound_confidence_map_transform.py @@ -11,14 +11,20 @@ from __future__ import annotations +import os import unittest import numpy as np import torch +from parameterized import parameterized +from PIL import Image from monai.transforms import UltrasoundConfidenceMapTransform +from monai.utils import optional_import from tests.utils import assert_allclose +_, has_scipy = optional_import("scipy") + TEST_INPUT = np.array( [ [1, 2, 3, 23, 13, 22, 5, 1, 2, 3], @@ -31,7 +37,8 @@ [1, 2, 3, 32, 33, 34, 35, 1, 2, 3], [1, 2, 3, 36, 37, 38, 39, 1, 2, 3], [1, 2, 3, 40, 41, 42, 43, 1, 2, 3], - ] + ], + dtype=np.float32, ) TEST_MASK = np.array( @@ -46,477 +53,439 @@ [1, 1, 1, 0, 0, 0, 1, 1, 1, 0], [1, 1, 1, 0, 0, 0, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - ] + ], + dtype=np.float32, ) SINK_ALL_OUTPUT = np.array( [ [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [ - 0.97514489, - 0.96762971, - 0.96164186, - 0.95463443, - 0.9941512, - 0.99023054, - 0.98559401, - 0.98230057, - 0.96601224, - 0.95119599, - ], - [ - 0.92960533, - 0.92638451, - 0.9056675, - 0.9487176, - 0.9546961, - 0.96165853, - 0.96172303, - 0.92686401, - 0.92122613, - 0.89957239, - ], - [ - 0.86490963, - 0.85723665, - 0.83798141, - 0.90816201, - 
0.90816097, - 0.90815301, - 0.9081427, - 0.85933627, - 0.85146935, - 0.82948586, - ], - [ - 0.77430346, - 0.76731372, - 0.74372311, - 0.89128774, - 0.89126885, - 0.89125066, - 0.89123521, - 0.76858589, - 0.76106647, - 0.73807776, - ], - [ - 0.66098109, - 0.65327697, - 0.63090644, - 0.33086588, - 0.3308383, - 0.33081937, - 0.33080718, - 0.6557468, - 0.64825099, - 0.62593375, - ], - [ - 0.52526945, - 0.51832586, - 0.49709412, - 0.25985059, - 0.25981009, - 0.25977729, - 0.25975222, - 0.52118958, - 0.51426328, - 0.49323164, - ], - [ - 0.3697845, - 0.36318971, - 0.34424661, - 0.17386804, - 0.17382046, - 0.17377993, - 0.17374668, - 0.36689317, - 0.36036096, - 0.3415582, - ], - [ - 0.19546374, - 0.1909659, - 0.17319999, - 0.08423318, - 0.08417993, - 0.08413242, - 0.08409104, - 0.19393909, - 0.18947485, - 0.17185031, + 0.8884930952884654, + 0.8626656901726876, + 0.8301161870669913, + 0.9757179300830185, + 0.9989819637626414, + 0.9994717624885747, + 0.9954377526794013, + 0.8898638133944221, + 0.862604343021387, + 0.8277862494812598, + ], + [ + 0.7765718877433174, + 0.7363731552518268, + 0.6871875923653379, + 0.9753673327387775, + 0.9893175316399789, + 0.9944181334242039, + 0.9936979128319371, + 0.7778001700035326, + 0.7362622619974832, + 0.6848377775329241, + ], + [ + 0.6648416226360719, + 0.6178079903692397, + 0.5630152545966568, + 0.8278402502498404, + 0.82790391019578, + 0.8289702087149963, + 0.8286730258710652, + 0.6658773633169731, + 0.6176836507071695, + 0.5609165245633834, + ], + [ + 0.5534420483956817, + 0.5055401989946189, + 0.451865872383879, + 0.7541423053657541, + 0.7544115886347456, + 0.7536884376055174, + 0.7524927915364896, + 0.5542943466824017, + 0.505422678400297, + 0.4502051549732117, + ], + [ + 0.4423657561928356, + 0.398221575954319, + 0.35030055029978124, + 0.4793202144786371, + 0.48057175662074125, + 0.4812057229564038, + 0.48111949176149327, + 0.44304092606050766, + 0.39812149713417405, + 0.34902458531143377, + ], + [ + 0.3315561576450342, + 
0.29476346732036784, + 0.2558303772864961, + 0.35090405668257535, + 0.3515225984307705, + 0.35176548159366317, + 0.3516979775419521, + 0.33205839061494885, + 0.2946859567272435, + 0.2549042599220772, + ], + [ + 0.22094175240967673, + 0.19431840633358133, + 0.16672448058324435, + 0.22716195845848167, + 0.22761996456848282, + 0.22782525614780919, + 0.22781876632199002, + 0.22127471252104777, + 0.19426593309729956, + 0.16612306610996525, + ], + [ + 0.11044782531624744, + 0.09623229814933323, + 0.08174664901235043, + 0.11081911718888311, + 0.11102310514207447, + 0.1111041051969924, + 0.11108329076967229, + 0.11061376973431204, + 0.09620592927336903, + 0.08145227209865454, ], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - ] + ], + dtype=np.float32, ) SINK_MID_OUTPUT = np.array( [ + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [ - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - ], - [ - 9.99996103e-01, - 9.99994823e-01, - 9.99993550e-01, - 9.99930863e-01, - 9.99990782e-01, - 9.99984683e-01, - 9.99979000e-01, - 9.99997804e-01, - 9.99995985e-01, - 9.99994325e-01, - ], - [ - 9.99989344e-01, - 9.99988600e-01, - 9.99984099e-01, - 9.99930123e-01, - 9.99926598e-01, - 9.99824297e-01, - 9.99815032e-01, - 9.99991228e-01, - 9.99990881e-01, - 9.99988462e-01, - ], - [ - 9.99980787e-01, - 9.99979264e-01, - 9.99975828e-01, - 9.59669286e-01, - 9.59664779e-01, - 9.59656566e-01, - 9.59648332e-01, - 9.99983882e-01, - 9.99983038e-01, - 9.99980732e-01, - ], - [ - 9.99970181e-01, - 9.99969032e-01, - 9.99965730e-01, - 9.45197806e-01, - 9.45179593e-01, - 9.45163629e-01, - 9.45151458e-01, - 9.99973352e-01, - 9.99973254e-01, - 9.99971098e-01, - ], - [ - 9.99958608e-01, - 9.99957307e-01, - 9.99953444e-01, - 4.24743523e-01, - 4.24713305e-01, - 4.24694646e-01, - 4.24685271e-01, - 9.99960948e-01, - 9.99961829e-01, - 9.99960347e-01, - ], - [ - 9.99946675e-01, - 
9.99945139e-01, - 9.99940312e-01, - 3.51353224e-01, - 3.51304003e-01, - 3.51268260e-01, - 3.51245366e-01, - 9.99947688e-01, - 9.99950165e-01, - 9.99949512e-01, - ], - [ - 9.99935877e-01, - 9.99934088e-01, - 9.99928982e-01, - 2.51197134e-01, - 2.51130273e-01, - 2.51080014e-01, - 2.51045852e-01, - 9.99936187e-01, - 9.99939716e-01, - 9.99940022e-01, - ], - [ - 9.99927846e-01, - 9.99925911e-01, - 9.99920188e-01, - 1.31550973e-01, - 1.31462736e-01, - 1.31394558e-01, - 1.31346069e-01, - 9.99927275e-01, - 9.99932142e-01, - 9.99933313e-01, - ], - [ - 9.99924204e-01, - 9.99922004e-01, - 9.99915767e-01, - 3.04861147e-04, - 1.95998056e-04, - 0.00000000e00, - 2.05182682e-05, - 9.99923115e-01, - 9.99928835e-01, - 9.99930535e-01, - ], - ] + 0.9999957448889315, + 0.9999781044114231, + 0.9999142422442185, + 0.999853253199584, + 0.9999918403054282, + 0.9999874855193227, + 0.9999513619364747, + 0.9999589247003497, + 0.9999861765528631, + 0.9999939213967494, + ], + [ + 0.9999918011366045, + 0.9999588498417253, + 0.9998388659316617, + 0.9998496524281603, + 0.9999154673258592, + 0.9997827845182361, + 0.9998160234579786, + 0.9999163964511287, + 0.9999743435786168, + 0.9999894752861168, + ], + [ + 0.9999883847481621, + 0.9999427334014465, + 0.9997703972600652, + 0.9853967608835997, + 0.9852517829915376, + 0.9853308520519438, + 0.9854102394414211, + 0.9998728503298413, + 0.9999642585978225, + 0.999986204909933, + ], + [ + 0.999985544721449, + 0.9999296195017368, + 0.9997066149628903, + 0.9753803016111353, + 0.9750688049429371, + 0.9749211929217173, + 0.9750052047129354, + 0.9998284130289159, + 0.9999558481338295, + 0.9999837966320273, + ], + [ + 0.9999832723447848, + 0.9999192263814408, + 0.9996472692076177, + 0.90541293509353, + 0.9049945536526819, + 0.9051142437853055, + 0.9057005861296792, + 0.9997839348839027, + 0.9999490318922627, + 0.9999820419085812, + ], + [ + 0.9999815409510937, + 0.9999113168889934, + 0.9995930143319085, + 0.8370025145062345, + 0.8358345435164332, + 
0.8358231468627223, + 0.8369430449157075, + 0.9997408260265034, + 0.9999437526409107, + 0.9999808010740554, + ], + [ + 0.9999803198262347, + 0.9999057164296593, + 0.9995461103528891, + 0.7047260555380003, + 0.7023346743490383, + 0.7022946969603594, + 0.7045662738042475, + 0.9997017258131392, + 0.9999399744001316, + 0.9999799785302944, + ], + [ + 0.9999795785255197, + 0.9999022923125928, + 0.999510772973329, + 0.46283993237260707, + 0.4577365087549323, + 0.4571888733219068, + 0.4614967878524538, + 0.9996710272733927, + 0.9999376682163403, + 0.9999795067125865, + ], + [ + 0.9999792877553907, + 0.9999009179811408, + 0.9994950057121632, + 0.05049460567213739, + 0.030946131978013824, + 0.0, + 0.019224121648385283, + 0.9996568912408903, + 0.9999367861122628, + 0.9999793358521326, + ], + ], + dtype=np.float32, ) SINK_MIN_OUTPUT = np.array( [ [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [ - 0.99997545, - 0.99996582, - 0.99995245, - 0.99856594, - 0.99898314, - 0.99777223, - 0.99394423, - 0.98588113, - 0.97283215, - 0.96096504, - ], - [ - 0.99993872, - 0.99993034, - 0.9998832, - 0.9986147, - 0.99848741, - 0.9972981, - 0.99723719, - 0.94157173, - 0.9369832, - 0.91964243, - ], - [ - 0.99990802, - 0.99989475, - 0.99986873, - 0.98610197, - 0.98610047, - 0.98609749, - 0.98609423, - 0.88741275, - 0.88112911, - 0.86349156, - ], - [ - 0.99988924, - 0.99988509, - 0.99988698, - 0.98234089, - 0.98233591, - 0.98233065, - 0.98232562, - 0.81475172, - 0.80865978, - 0.79033138, - ], - [ - 0.99988418, - 0.99988484, - 0.99988323, - 0.86796555, - 0.86795874, - 0.86795283, - 0.86794756, - 0.72418193, - 0.71847704, - 0.70022037, - ], - [ - 0.99988241, - 0.99988184, - 0.99988103, - 0.85528225, - 0.85527303, - 0.85526389, - 0.85525499, - 0.61716519, - 0.61026209, - 0.59503671, - ], - [ - 0.99988015, - 0.99987985, - 0.99987875, - 0.84258114, - 0.84257121, - 0.84256042, - 0.84254897, - 0.48997924, - 0.49083978, - 0.46891561, - ], - [ - 0.99987865, - 0.99987827, - 0.9998772, - 0.83279589, - 
0.83278624, - 0.83277384, - 0.83275897, - 0.36345545, - 0.33690244, - 0.35696828, - ], - [ - 0.99987796, - 0.99987756, - 0.99987643, - 0.82873223, - 0.82872648, - 0.82871803, - 0.82870711, - 0.0, - 0.26106012, - 0.29978657, - ], - ] + 0.9999961997987318, + 0.9999801752476248, + 0.9999185667341594, + 0.9993115972922259, + 0.9999536433504382, + 0.9997590064584757, + 0.9963282396026231, + 0.9020645423682648, + 0.965641014946897, + 0.9847003633599846, + ], + [ + 0.9999926824858815, + 0.9999628275604145, + 0.9998472915971415, + 0.9992953054409239, + 0.9995550237000549, + 0.9972853256638443, + 0.9958871482234863, + 0.8006505271617617, + 0.9360757301263053, + 0.9734843475613124, + ], + [ + 0.9999896427490426, + 0.9999484707116104, + 0.9997841142091455, + 0.9321779021295554, + 0.9308591506422442, + 0.9299937642438358, + 0.9286536283468563, + 0.6964658886602826, + 0.9106656689679997, + 0.9652109119709528, + ], + [ + 0.9999871227708508, + 0.9999369646510842, + 0.9997276125796202, + 0.9006206490361908, + 0.8987968702587018, + 0.8965696900664386, + 0.8941507574801211, + 0.5892568658180841, + 0.8892240419729905, + 0.9590996257620853, + ], + [ + 0.9999851119906539, + 0.9999280075234918, + 0.9996788394671484, + 0.778755271203017, + 0.7763917808258874, + 0.7737517385551721, + 0.7707980517990098, + 0.4788014936236403, + 0.8715671104783401, + 0.954632732759503, + ], + [ + 0.9999835837292402, + 0.999921323618806, + 0.9996389455307461, + 0.7222961578407286, + 0.7186158832946955, + 0.7146983167265393, + 0.7105768254632475, + 0.3648911004360315, + 0.8575943501305144, + 0.9514642802768379, + ], + [ + 0.9999825081019064, + 0.999916683268467, + 0.9996093996776352, + 0.6713490686473397, + 0.6664914636518112, + 0.6613110504728309, + 0.6558325489984669, + 0.247299682539502, + 0.8473037957967624, + 0.9493580587294981, + ], + [ + 0.999981856118739, + 0.9999138938063622, + 0.9995907248497593, + 0.6331535096751639, + 0.6271637176135582, + 0.6206687804556549, + 0.6136262027168252, + 
0.12576864809108962, + 0.8407892431959736, + 0.9481472656653798, + ], + [ + 0.9999816006081851, + 0.9999127861527936, + 0.9995832399159849, + 0.6133274396648696, + 0.6086364734302403, + 0.6034602717119345, + 0.5978473214165134, + 0.0, + 0.8382338778894218, + 0.9477082231321966, + ], + ], + dtype=np.float32, ) SINK_MASK_OUTPUT = np.array( [ + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.9047934405899283, 0.9936046284605553, 0.9448690902377527, 0.0, 0.0, 0.0, 0.8363773255131761], + [0.0, 0.0, 0.0, 0.90375200446097, 0.9434594475474036, 0.4716831449516178, 0.0, 0.0, 0.0, 0.7364197333910302], + [ + 0.0, + 0.0, + 0.0, + 0.09080438801405301, + 0.06774182873204163, + 0.038207095016625024, + 0.0, + 0.0, + 0.0, + 0.6745641479264269, + ], + [ + 0.0, + 0.0, + 0.0, + 0.01731082802870267, + 0.013540929458217351, + 0.007321202161532623, + 0.0, + 0.0, + 0.0, + 0.6341231654271253, + ], + [ + 0.0, + 0.0, + 0.0, + 0.0006444251665178544, + 0.0005397129128756325, + 0.0003048384803626333, + 0.0, + 0.0, + 0.0, + 0.6070178708536365, + ], + [ + 0.0, + 0.0, + 0.0, + 5.406078586212675e-05, + 4.416783924970537e-05, + 2.4597362039020103e-05, + 0.0, + 0.0, + 0.0, + 0.5889413683184284, + ], [ - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - 1.00000000e00, - ], - [ - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 2.86416400e-01, - 7.93271181e-01, - 5.81341234e-01, - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 1.98395623e-01, - ], - [ - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 2.66733297e-01, - 2.80741490e-01, - 4.14078784e-02, - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 7.91676486e-04, - ], - [ - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 1.86244537e-04, - 1.53413401e-04, - 7.85806495e-05, - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 5.09797387e-06, - ], - [ - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 
9.62904581e-07, - 7.23946225e-07, - 3.68824440e-07, - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 4.79525316e-08, - ], - [ - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 1.50939343e-10, - 1.17724874e-10, - 6.21760843e-11, - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 6.08922784e-10, - ], - [ - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 2.57593754e-13, - 1.94066716e-13, - 9.83784370e-14, - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 9.80828665e-12, - ], - [ - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 4.22323494e-16, - 3.17556633e-16, - 1.60789400e-16, - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 1.90789819e-13, - ], - [ - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 7.72677888e-19, - 5.83029424e-19, - 2.95946659e-19, - 0.00000000e00, - 0.00000000e00, - 0.00000000e00, - 4.97038275e-15, - ], - [ - 2.71345908e-24, - 5.92006757e-24, - 2.25580089e-23, - 3.82601970e-18, - 3.82835349e-18, - 3.83302158e-18, - 3.84002606e-18, - 8.40760586e-16, - 1.83433696e-15, - 1.11629633e-15, - ], - ] + 0.0, + 0.0, + 0.0, + 4.39259327223233e-06, + 3.6050656774754658e-06, + 2.0127120155893425e-06, + 0.0, + 0.0, + 0.0, + 0.5774279920364456, + ], + [ + 0.0, + 0.0, + 0.0, + 4.0740501726718113e-07, + 3.374875487404489e-07, + 1.9113630985667455e-07, + 0.0, + 0.0, + 0.0, + 0.5709897726747111, + ], + [ + 3.2266922388030425e-17, + 1.801110982679718e-14, + 9.325899448306927e-12, + 3.913608442133728e-07, + 3.9581822403393465e-07, + 4.02383505118481e-07, + 4.14820241328287e-07, + 4.281640797396309e-06, + 0.0023900192231620593, + 0.5686882523793125, + ], + ], + dtype=np.float32, ) +@unittest.skipUnless(has_scipy, "Requires scipy") class TestUltrasoundConfidenceMapTransform(unittest.TestCase): def setUp(self): @@ -526,6 +495,21 @@ def setUp(self): self.input_img_torch = torch.from_numpy(TEST_INPUT).unsqueeze(0) # mock image (torch tensor) self.input_mask_torch = torch.from_numpy(TEST_MASK).unsqueeze(0) # mock mask (torch tensor) + 
self.real_input_img_paths = [ + os.path.join(os.path.dirname(__file__), "testing_data", "ultrasound_confidence_map", "neck_input.png"), + os.path.join(os.path.dirname(__file__), "testing_data", "ultrasound_confidence_map", "femur_input.png"), + ] + + self.real_result_npy_paths = [ + os.path.join(os.path.dirname(__file__), "testing_data", "ultrasound_confidence_map", "neck_result.npy"), + os.path.join(os.path.dirname(__file__), "testing_data", "ultrasound_confidence_map", "femur_result.npy"), + ] + + self.real_input_paramaters = [ + {"alpha": 2.0, "beta": 90, "gamma": 0.03}, + {"alpha": 2.0, "beta": 90, "gamma": 0.06}, + ] + def test_parameters(self): # Unknown mode with self.assertRaises(ValueError): @@ -535,162 +519,92 @@ def test_parameters(self): with self.assertRaises(ValueError): UltrasoundConfidenceMapTransform(sink_mode="unknown") - def test_rgb(self): + @parameterized.expand( + [("all", SINK_ALL_OUTPUT), ("mid", SINK_MID_OUTPUT), ("min", SINK_MIN_OUTPUT), ("mask", SINK_MASK_OUTPUT, True)] + ) + def test_ultrasound_confidence_map_transform(self, sink_mode, expected_output, use_mask=False): # RGB image input_img_rgb = np.expand_dims(np.repeat(self.input_img_np, 3, axis=0), axis=0) input_img_rgb_torch = torch.from_numpy(input_img_rgb) - transform = UltrasoundConfidenceMapTransform(sink_mode="all") - result_torch = transform(input_img_rgb_torch) - self.assertIsInstance(result_torch, torch.Tensor) - assert_allclose(result_torch, torch.tensor(SINK_ALL_OUTPUT), rtol=1e-4, atol=1e-4) - result_np = transform(input_img_rgb) - self.assertIsInstance(result_np, np.ndarray) - assert_allclose(result_np, SINK_ALL_OUTPUT, rtol=1e-4, atol=1e-4) - - transform = UltrasoundConfidenceMapTransform(sink_mode="mid") - result_torch = transform(input_img_rgb_torch) - self.assertIsInstance(result_torch, torch.Tensor) - assert_allclose(result_torch, torch.tensor(SINK_MID_OUTPUT), rtol=1e-4, atol=1e-4) - result_np = transform(input_img_rgb) - self.assertIsInstance(result_np, np.ndarray) 
- assert_allclose(result_np, SINK_MID_OUTPUT, rtol=1e-4, atol=1e-4) + transform = UltrasoundConfidenceMapTransform(sink_mode=sink_mode) - transform = UltrasoundConfidenceMapTransform(sink_mode="min") - result_torch = transform(input_img_rgb_torch) - self.assertIsInstance(result_torch, torch.Tensor) - assert_allclose(result_torch, torch.tensor(SINK_MIN_OUTPUT), rtol=1e-4, atol=1e-4) - result_np = transform(input_img_rgb) - self.assertIsInstance(result_np, np.ndarray) - assert_allclose(result_np, SINK_MIN_OUTPUT, rtol=1e-4, atol=1e-4) + if use_mask: + result_torch = transform(input_img_rgb_torch, self.input_mask_torch) + result_np = transform(input_img_rgb, self.input_mask_np) + else: + result_torch = transform(input_img_rgb_torch) + result_np = transform(input_img_rgb) - transform = UltrasoundConfidenceMapTransform(sink_mode="mask") - result_torch = transform(input_img_rgb_torch, self.input_mask_torch) self.assertIsInstance(result_torch, torch.Tensor) - assert_allclose(result_torch, torch.tensor(SINK_MASK_OUTPUT), rtol=1e-4, atol=1e-4) - result_np = transform(input_img_rgb, self.input_mask_np) + assert_allclose(result_torch, torch.tensor(expected_output), rtol=1e-4, atol=1e-4) self.assertIsInstance(result_np, np.ndarray) - assert_allclose(result_np, SINK_MASK_OUTPUT, rtol=1e-4, atol=1e-4) + assert_allclose(result_np, expected_output, rtol=1e-4, atol=1e-4) - def test_multi_channel_2d(self): - # 2D multi-channel image + @parameterized.expand( + [ + ("all", SINK_ALL_OUTPUT), + ("mid", SINK_MID_OUTPUT), + ("min", SINK_MIN_OUTPUT), + ("mask", SINK_MASK_OUTPUT, True), # Adding a flag for mask cases + ] + ) + def test_multi_channel_2d(self, sink_mode, expected_output, use_mask=False): input_img_rgb = np.expand_dims(np.repeat(self.input_img_np, 17, axis=0), axis=0) input_img_rgb_torch = torch.from_numpy(input_img_rgb) - transform = UltrasoundConfidenceMapTransform(sink_mode="all") - result_torch = transform(input_img_rgb_torch) - self.assertIsInstance(result_torch, 
torch.Tensor) - assert_allclose(result_torch, torch.tensor(SINK_ALL_OUTPUT), rtol=1e-4, atol=1e-4) - result_np = transform(input_img_rgb) - self.assertIsInstance(result_np, np.ndarray) - assert_allclose(result_np, SINK_ALL_OUTPUT, rtol=1e-4, atol=1e-4) - - transform = UltrasoundConfidenceMapTransform(sink_mode="mid") - result_torch = transform(input_img_rgb_torch) - self.assertIsInstance(result_torch, torch.Tensor) - assert_allclose(result_torch, torch.tensor(SINK_MID_OUTPUT), rtol=1e-4, atol=1e-4) - result_np = transform(input_img_rgb) - self.assertIsInstance(result_np, np.ndarray) - assert_allclose(result_np, SINK_MID_OUTPUT, rtol=1e-4, atol=1e-4) + transform = UltrasoundConfidenceMapTransform(sink_mode=sink_mode) - transform = UltrasoundConfidenceMapTransform(sink_mode="min") - result_torch = transform(input_img_rgb_torch) - self.assertIsInstance(result_torch, torch.Tensor) - assert_allclose(result_torch, torch.tensor(SINK_MIN_OUTPUT), rtol=1e-4, atol=1e-4) - result_np = transform(input_img_rgb) - self.assertIsInstance(result_np, np.ndarray) - assert_allclose(result_np, SINK_MIN_OUTPUT, rtol=1e-4, atol=1e-4) + if use_mask: + result_torch = transform(input_img_rgb_torch, self.input_mask_torch) + result_np = transform(input_img_rgb, self.input_mask_np) + else: + result_torch = transform(input_img_rgb_torch) + result_np = transform(input_img_rgb) - transform = UltrasoundConfidenceMapTransform(sink_mode="mask") - result_torch = transform(input_img_rgb_torch, self.input_mask_torch) self.assertIsInstance(result_torch, torch.Tensor) - assert_allclose(result_torch, torch.tensor(SINK_MASK_OUTPUT), rtol=1e-4, atol=1e-4) - result_np = transform(input_img_rgb, self.input_mask_np) + assert_allclose(result_torch, torch.tensor(expected_output), rtol=1e-4, atol=1e-4) self.assertIsInstance(result_np, np.ndarray) - assert_allclose(result_np, SINK_MASK_OUTPUT, rtol=1e-4, atol=1e-4) + assert_allclose(result_np, expected_output, rtol=1e-4, atol=1e-4) - def 
test_non_one_first_dim(self): - # Image without first dimension as 1 + @parameterized.expand([("all",), ("mid",), ("min",), ("mask",)]) + def test_non_one_first_dim(self, sink_mode): + transform = UltrasoundConfidenceMapTransform(sink_mode=sink_mode) input_img_rgb = np.repeat(self.input_img_np, 3, axis=0) input_img_rgb_torch = torch.from_numpy(input_img_rgb) - transform = UltrasoundConfidenceMapTransform(sink_mode="all") - with self.assertRaises(ValueError): - transform(input_img_rgb_torch) - with self.assertRaises(ValueError): - transform(input_img_rgb) - - transform = UltrasoundConfidenceMapTransform(sink_mode="mid") - with self.assertRaises(ValueError): - transform(input_img_rgb_torch) - with self.assertRaises(ValueError): - transform(input_img_rgb) - - transform = UltrasoundConfidenceMapTransform(sink_mode="min") - with self.assertRaises(ValueError): - transform(input_img_rgb_torch) - with self.assertRaises(ValueError): - transform(input_img_rgb) - - transform = UltrasoundConfidenceMapTransform(sink_mode="mask") - with self.assertRaises(ValueError): - transform(input_img_rgb_torch, self.input_mask_torch) - with self.assertRaises(ValueError): - transform(input_img_rgb, self.input_mask_np) - - def test_no_first_dim(self): - # Image without first dimension + if sink_mode == "mask": + with self.assertRaises(ValueError): + transform(input_img_rgb_torch, self.input_mask_torch) + with self.assertRaises(ValueError): + transform(input_img_rgb, self.input_mask_np) + else: + with self.assertRaises(ValueError): + transform(input_img_rgb_torch) + with self.assertRaises(ValueError): + transform(input_img_rgb) + + @parameterized.expand([("all",), ("mid",), ("min",), ("mask",)]) + def test_no_first_dim(self, sink_mode): input_img_rgb = self.input_img_np[0] input_img_rgb_torch = torch.from_numpy(input_img_rgb) - transform = UltrasoundConfidenceMapTransform(sink_mode="all") - with self.assertRaises(ValueError): - transform(input_img_rgb_torch) - with 
self.assertRaises(ValueError): - transform(input_img_rgb) - - transform = UltrasoundConfidenceMapTransform(sink_mode="mid") - with self.assertRaises(ValueError): - transform(input_img_rgb_torch) - with self.assertRaises(ValueError): - transform(input_img_rgb) + transform = UltrasoundConfidenceMapTransform(sink_mode=sink_mode) - transform = UltrasoundConfidenceMapTransform(sink_mode="min") with self.assertRaises(ValueError): transform(input_img_rgb_torch) with self.assertRaises(ValueError): transform(input_img_rgb) - transform = UltrasoundConfidenceMapTransform(sink_mode="mask") - with self.assertRaises(ValueError): - transform(input_img_rgb_torch, self.input_mask_torch) - with self.assertRaises(ValueError): - transform(input_img_rgb, self.input_mask_np) + if sink_mode == "mask": + with self.assertRaises(ValueError): + transform(input_img_rgb_torch, self.input_mask_torch) + with self.assertRaises(ValueError): + transform(input_img_rgb, self.input_mask_np) - def test_sink_all(self): - transform = UltrasoundConfidenceMapTransform(sink_mode="all") - - # This should not raise an exception for torch tensor - result_torch = transform(self.input_img_torch) - self.assertIsInstance(result_torch, torch.Tensor) - - # This should not raise an exception for numpy array - result_np = transform(self.input_img_np) - self.assertIsInstance(result_np, np.ndarray) - - def test_sink_mid(self): - transform = UltrasoundConfidenceMapTransform(sink_mode="mid") - - # This should not raise an exception for torch tensor - result_torch = transform(self.input_img_torch) - self.assertIsInstance(result_torch, torch.Tensor) - - # This should not raise an exception for numpy array - result_np = transform(self.input_img_np) - self.assertIsInstance(result_np, np.ndarray) - - def test_sink_min(self): - transform = UltrasoundConfidenceMapTransform(sink_mode="min") + @parameterized.expand([("all",), ("mid",), ("min",)]) + def test_sink_mode(self, mode): + transform = 
UltrasoundConfidenceMapTransform(sink_mode=mode) # This should not raise an exception for torch tensor result_torch = transform(self.input_img_torch) @@ -752,6 +666,44 @@ def test_func(self): output = transform(self.input_img_torch, self.input_mask_torch) assert_allclose(output, torch.tensor(SINK_MASK_OUTPUT), rtol=1e-4, atol=1e-4) + def test_against_official_code(self): + # This test is to compare the output of the transform with the official code + # The official code is available at: + # https://campar.in.tum.de/Main/AthanasiosKaramalisCode + + for input_img_path, result_npy_path, params in zip( + self.real_input_img_paths, self.real_result_npy_paths, self.real_input_paramaters + ): + input_img = np.array(Image.open(input_img_path)) + input_img = np.expand_dims(input_img, axis=0) + + result_img = np.load(result_npy_path) + + transform = UltrasoundConfidenceMapTransform(sink_mode="all", **params) + output = transform(input_img) + + assert_allclose(output, result_img, rtol=1e-4, atol=1e-4) + + def test_against_official_code_using_cg(self): + # This test is to compare the output of the transform with the official code + # The official code is available at: + # https://campar.in.tum.de/Main/AthanasiosKaramalisCode + + for input_img_path, result_npy_path, params in zip( + self.real_input_img_paths, self.real_result_npy_paths, self.real_input_paramaters + ): + input_img = np.array(Image.open(input_img_path)) + input_img = np.expand_dims(input_img, axis=0) + + result_img = np.load(result_npy_path) + + transform = UltrasoundConfidenceMapTransform( + sink_mode="all", use_cg=True, cg_tol=1.0e-6, cg_maxiter=300, **params + ) + output = transform(input_img) + + assert_allclose(output, result_img, rtol=1e-2, atol=1e-2) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_unetr.py b/tests/test_unetr.py index 46018d2bc0..8c5ecb32e1 100644 --- a/tests/test_unetr.py +++ b/tests/test_unetr.py @@ -30,7 +30,7 @@ for num_heads in [8]: for mlp_dim in [3072]: for 
norm_name in ["instance"]: - for pos_embed in ["perceptron"]: + for proj_type in ["perceptron"]: for nd in (2, 3): test_case = [ { @@ -42,7 +42,7 @@ "norm_name": norm_name, "mlp_dim": mlp_dim, "num_heads": num_heads, - "pos_embed": pos_embed, + "proj_type": proj_type, "dropout_rate": dropout_rate, "conv_block": True, "res_block": False, @@ -75,7 +75,7 @@ def test_ill_arg(self): hidden_size=128, mlp_dim=3072, num_heads=12, - pos_embed="conv", + proj_type="conv", norm_name="instance", dropout_rate=5.0, ) @@ -89,7 +89,7 @@ def test_ill_arg(self): hidden_size=512, mlp_dim=3072, num_heads=12, - pos_embed="conv", + proj_type="conv", norm_name="instance", dropout_rate=0.5, ) @@ -103,7 +103,7 @@ def test_ill_arg(self): hidden_size=512, mlp_dim=3072, num_heads=14, - pos_embed="conv", + proj_type="conv", norm_name="batch", dropout_rate=0.4, ) @@ -117,13 +117,13 @@ def test_ill_arg(self): hidden_size=768, mlp_dim=3072, num_heads=12, - pos_embed="perc", + proj_type="perc", norm_name="instance", dropout_rate=0.2, ) @parameterized.expand(TEST_CASE_UNETR) - @SkipIfBeforePyTorchVersion((1, 9)) + @SkipIfBeforePyTorchVersion((2, 0)) def test_script(self, input_param, input_shape, _): net = UNETR(**(input_param)) net.eval() diff --git a/tests/test_vector_quantizer.py b/tests/test_vector_quantizer.py new file mode 100644 index 0000000000..43533d0377 --- /dev/null +++ b/tests/test_vector_quantizer.py @@ -0,0 +1,89 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest +from math import prod + +import torch +from parameterized import parameterized + +from monai.networks.layers import EMAQuantizer, VectorQuantizer + +TEST_CASES = [ + [{"spatial_dims": 2, "num_embeddings": 16, "embedding_dim": 8}, (1, 8, 4, 4), (1, 4, 4)], + [{"spatial_dims": 3, "num_embeddings": 16, "embedding_dim": 8}, (1, 8, 4, 4, 4), (1, 4, 4, 4)], +] + + +class TestEMA(unittest.TestCase): + @parameterized.expand(TEST_CASES) + def test_ema_shape(self, input_param, input_shape, output_shape): + layer = EMAQuantizer(**input_param) + x = torch.randn(input_shape) + layer = layer.train() + outputs = layer(x) + self.assertEqual(outputs[0].shape, input_shape) + self.assertEqual(outputs[2].shape, output_shape) + + layer = layer.eval() + outputs = layer(x) + self.assertEqual(outputs[0].shape, input_shape) + self.assertEqual(outputs[2].shape, output_shape) + + @parameterized.expand(TEST_CASES) + def test_ema_quantize(self, input_param, input_shape, output_shape): + layer = EMAQuantizer(**input_param) + x = torch.randn(input_shape) + outputs = layer.quantize(x) + self.assertEqual(outputs[0].shape, (prod(input_shape[2:]), input_shape[1])) # (HxW[xD], C) + self.assertEqual(outputs[1].shape, (prod(input_shape[2:]), input_param["num_embeddings"])) # (HxW[xD], E) + self.assertEqual(outputs[2].shape, (input_shape[0],) + input_shape[2:]) # (1, H, W, [D]) + + def test_ema(self): + layer = EMAQuantizer(spatial_dims=2, num_embeddings=2, embedding_dim=2, epsilon=0, decay=0) + original_weight_0 = layer.embedding.weight[0].clone() + original_weight_1 = layer.embedding.weight[1].clone() + x_0 = original_weight_0 + x_0 = x_0.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + x_0 = x_0.repeat(1, 1, 1, 2) + 0.001 + + x_1 = original_weight_1 + x_1 = x_1.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + x_1 = x_1.repeat(1, 1, 1, 2) + + x = torch.cat([x_0, x_1], dim=0) + layer = layer.train() + _ = layer(x) + + 
self.assertTrue(all(layer.embedding.weight[0] != original_weight_0)) + self.assertTrue(all(layer.embedding.weight[1] == original_weight_1)) + + +class TestVectorQuantizer(unittest.TestCase): + @parameterized.expand(TEST_CASES) + def test_vector_quantizer_shape(self, input_param, input_shape, output_shape): + layer = VectorQuantizer(EMAQuantizer(**input_param)) + x = torch.randn(input_shape) + outputs = layer(x) + self.assertEqual(outputs[1].shape, input_shape) + + @parameterized.expand(TEST_CASES) + def test_vector_quantizer_quantize(self, input_param, input_shape, output_shape): + layer = VectorQuantizer(EMAQuantizer(**input_param)) + x = torch.randn(input_shape) + outputs = layer.quantize(x) + self.assertEqual(outputs.shape, output_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_vis_cam.py b/tests/test_vis_cam.py index b641599af2..68b12de2f8 100644 --- a/tests/test_vis_cam.py +++ b/tests/test_vis_cam.py @@ -70,6 +70,8 @@ class TestClassActivationMap(unittest.TestCase): @parameterized.expand([TEST_CASE_0, TEST_CASE_1, TEST_CASE_2, TEST_CASE_3]) def test_shape(self, input_data, expected_shape): + model = None + if input_data["model"] == "densenet2d": model = DenseNet121(spatial_dims=2, in_channels=1, out_channels=3) if input_data["model"] == "densenet3d": @@ -80,6 +82,7 @@ def test_shape(self, input_data, expected_shape): model = SEResNet50(spatial_dims=2, in_channels=3, num_classes=4) if input_data["model"] == "senet3d": model = SEResNet50(spatial_dims=3, in_channels=3, num_classes=4) + device = "cuda:0" if torch.cuda.is_available() else "cpu" model.to(device) model.eval() diff --git a/tests/test_vis_gradcam.py b/tests/test_vis_gradcam.py index 325b74b3ce..f77d916a5b 100644 --- a/tests/test_vis_gradcam.py +++ b/tests/test_vis_gradcam.py @@ -153,6 +153,8 @@ class TestGradientClassActivationMap(unittest.TestCase): @parameterized.expand(TESTS) def test_shape(self, cam_class, input_data, expected_shape): + model = None + if 
input_data["model"] == "densenet2d": model = DenseNet121(spatial_dims=2, in_channels=1, out_channels=3) elif input_data["model"] == "densenet2d_bin": diff --git a/tests/test_vista3d.py b/tests/test_vista3d.py new file mode 100644 index 0000000000..d3b4e0c10e --- /dev/null +++ b/tests/test_vista3d.py @@ -0,0 +1,85 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets import VISTA3D, SegResNetDS2 +from monai.networks.nets.vista3d import ClassMappingClassify, PointMappingSAM +from tests.utils import SkipIfBeforePyTorchVersion, skip_if_quick + +device = "cuda" if torch.cuda.is_available() else "cpu" + +TEST_CASES = [ + [{"encoder_embed_dim": 48, "in_channels": 1}, {}, (1, 1, 64, 64, 64), (1, 1, 64, 64, 64)], + [{"encoder_embed_dim": 48, "in_channels": 2}, {}, (1, 2, 64, 64, 64), (1, 1, 64, 64, 64)], + [ + {"encoder_embed_dim": 48, "in_channels": 1}, + {"class_vector": torch.tensor([1, 2, 3], device=device)}, + (1, 1, 64, 64, 64), + (3, 1, 64, 64, 64), + ], + [ + {"encoder_embed_dim": 48, "in_channels": 1}, + { + "point_coords": torch.tensor([[[1, 2, 3], [1, 2, 3]]], device=device), + "point_labels": torch.tensor([[1, 0]], device=device), + }, + (1, 1, 64, 64, 64), + (1, 1, 64, 64, 64), + ], + [ + {"encoder_embed_dim": 48, "in_channels": 1}, + { + "class_vector": 
torch.tensor([1, 2], device=device), + "point_coords": torch.tensor([[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]], device=device), + "point_labels": torch.tensor([[1, 0], [1, 0]], device=device), + }, + (1, 1, 64, 64, 64), + (2, 1, 64, 64, 64), + ], +] + + +@SkipIfBeforePyTorchVersion((1, 11)) +@skip_if_quick +class TestVista3d(unittest.TestCase): + + @parameterized.expand(TEST_CASES) + def test_vista3d_shape(self, args, input_params, input_shape, expected_shape): + segresnet = SegResNetDS2( + in_channels=args["in_channels"], + blocks_down=(1, 2, 2, 4, 4), + norm="instance", + out_channels=args["encoder_embed_dim"], + init_filters=args["encoder_embed_dim"], + dsdepth=1, + ) + point_head = PointMappingSAM(feature_size=args["encoder_embed_dim"], n_classes=512, last_supported=132) + class_head = ClassMappingClassify(n_classes=512, feature_size=args["encoder_embed_dim"], use_mlp=True) + net = VISTA3D(image_encoder=segresnet, class_head=class_head, point_head=point_head).to(device) + with eval_mode(net): + result = net.forward( + torch.randn(input_shape).to(device), + point_coords=input_params.get("point_coords", None), + point_labels=input_params.get("point_labels", None), + class_vector=input_params.get("class_vector", None), + ) + self.assertEqual(result.shape, expected_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_vista3d_utils.py b/tests/test_vista3d_utils.py new file mode 100644 index 0000000000..a940854d88 --- /dev/null +++ b/tests/test_vista3d_utils.py @@ -0,0 +1,133 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest +from unittest.case import skipUnless + +import numpy as np +import torch +from parameterized import parameterized + +from monai.transforms.utils import ( + convert_points_to_disc, + get_largest_connected_component_mask_point, + sample_points_from_label, +) +from monai.utils import min_version +from monai.utils.module import optional_import +from tests.utils import skip_if_no_cuda, skip_if_quick + +cp, has_cp = optional_import("cupy") +cucim_skimage, has_cucim = optional_import("cucim.skimage") +measure, has_measure = optional_import("skimage.measure", "0.14.2", min_version) + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +TESTS_SAMPLE_POINTS_FROM_LABEL = [] +for use_center in [True, False]: + labels = torch.zeros(1, 1, 32, 32, 32) + labels[0, 0, 5:10, 5:10, 5:10] = 1 + labels[0, 0, 10:15, 10:15, 10:15] = 3 + labels[0, 0, 20:25, 20:25, 20:25] = 5 + TESTS_SAMPLE_POINTS_FROM_LABEL.append( + [{"labels": labels, "label_set": (1, 3, 5), "use_center": use_center}, (3, 1, 3), (3, 1)] + ) + +TEST_CONVERT_POINTS_TO_DISC = [] +for radius in [1, 2]: + for disc in [True, False]: + image_size = (32, 32, 32) + point = torch.randn(3, 1, 3) + point_label = torch.randint(0, 4, (3, 1)) + expected_shape = (point.shape[0], 2, *image_size) + TEST_CONVERT_POINTS_TO_DISC.append( + [ + {"image_size": image_size, "point": point, "point_label": point_label, "radius": radius, "disc": disc}, + expected_shape, + ] + ) + +TEST_LCC_MASK_POINT_TORCH = [] +for bs in [1, 2]: + for num_points in [1, 3]: + shape 
= (bs, 1, 128, 32, 32) + TEST_LCC_MASK_POINT_TORCH.append( + [ + { + "img_pos": torch.randint(0, 2, shape, dtype=torch.bool), + "img_neg": torch.randint(0, 2, shape, dtype=torch.bool), + "point_coords": torch.randint(0, 10, (bs, num_points, 3)), + "point_labels": torch.randint(0, 4, (bs, num_points)), + }, + shape, + ] + ) + +TEST_LCC_MASK_POINT_NP = [] +for bs in [1, 2]: + for num_points in [1, 3]: + shape = (bs, 1, 32, 32, 64) + TEST_LCC_MASK_POINT_NP.append( + [ + { + "img_pos": np.random.randint(0, 2, shape, dtype=bool), + "img_neg": np.random.randint(0, 2, shape, dtype=bool), + "point_coords": np.random.randint(0, 5, (bs, num_points, 3)), + "point_labels": np.random.randint(0, 4, (bs, num_points)), + }, + shape, + ] + ) + + +@skipUnless(has_measure or cucim_skimage, "skimage or cucim.skimage required") +class TestSamplePointsFromLabel(unittest.TestCase): + + @parameterized.expand(TESTS_SAMPLE_POINTS_FROM_LABEL) + def test_shape(self, input_data, expected_point_shape, expected_point_label_shape): + point, point_label = sample_points_from_label(**input_data) + self.assertEqual(point.shape, expected_point_shape) + self.assertEqual(point_label.shape, expected_point_label_shape) + + +class TestConvertPointsToDisc(unittest.TestCase): + + @parameterized.expand(TEST_CONVERT_POINTS_TO_DISC) + def test_shape(self, input_data, expected_shape): + result = convert_points_to_disc(**input_data) + self.assertEqual(result.shape, expected_shape) + + +@skipUnless(has_measure or cucim_skimage, "skimage or cucim.skimage required") +class TestGetLargestConnectedComponentMaskPoint(unittest.TestCase): + + @skip_if_quick + @skip_if_no_cuda + @skipUnless(has_cp and cucim_skimage, "cupy and cucim.skimage required") + @parameterized.expand(TEST_LCC_MASK_POINT_TORCH) + def test_cp_shape(self, input_data, shape): + for key in input_data: + input_data[key] = input_data[key].to(device) + mask = get_largest_connected_component_mask_point(**input_data) + self.assertEqual(mask.shape, shape) + + 
@skipUnless(has_measure, "skimage required") + @parameterized.expand(TEST_LCC_MASK_POINT_NP) + def test_np_shape(self, input_data, shape): + mask = get_largest_connected_component_mask_point(**input_data) + self.assertEqual(mask.shape, shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_vit.py b/tests/test_vit.py index a84883cba0..a3ffd0b2ef 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -30,7 +30,7 @@ for mlp_dim in [3072]: for num_layers in [4]: for num_classes in [8]: - for pos_embed in ["conv", "perceptron"]: + for proj_type in ["conv", "perceptron"]: for classification in [False, True]: for nd in (2, 3): test_case = [ @@ -42,7 +42,7 @@ "mlp_dim": mlp_dim, "num_layers": num_layers, "num_heads": num_heads, - "pos_embed": pos_embed, + "proj_type": proj_type, "classification": classification, "num_classes": num_classes, "dropout_rate": dropout_rate, @@ -69,79 +69,44 @@ def test_shape(self, input_param, input_shape, expected_shape): result, _ = net(torch.randn(input_shape)) self.assertEqual(result.shape, expected_shape) - def test_ill_arg(self): + @parameterized.expand( + [ + (1, (128, 128, 128), (16, 16, 16), 128, 3072, 12, 12, "conv", False, 5.0), + (1, (32, 32, 32), (64, 64, 64), 512, 3072, 12, 8, "perceptron", False, 0.3), + (1, (96, 96, 96), (8, 8, 8), 512, 3072, 12, 14, "conv", False, 0.3), + (1, (97, 97, 97), (4, 4, 4), 768, 3072, 12, 8, "perceptron", True, 0.3), + (4, (96, 96, 96), (16, 16, 16), 768, 3072, 12, 12, "perc", False, 0.3), + ] + ) + def test_ill_arg( + self, + in_channels, + img_size, + patch_size, + hidden_size, + mlp_dim, + num_layers, + num_heads, + proj_type, + classification, + dropout_rate, + ): with self.assertRaises(ValueError): ViT( - in_channels=1, - img_size=(128, 128, 128), - patch_size=(16, 16, 16), - hidden_size=128, - mlp_dim=3072, - num_layers=12, - num_heads=12, - pos_embed="conv", - classification=False, - dropout_rate=5.0, + in_channels=in_channels, + img_size=img_size, + 
patch_size=patch_size, + hidden_size=hidden_size, + mlp_dim=mlp_dim, + num_layers=num_layers, + num_heads=num_heads, + proj_type=proj_type, + classification=classification, + dropout_rate=dropout_rate, ) - with self.assertRaises(ValueError): - ViT( - in_channels=1, - img_size=(32, 32, 32), - patch_size=(64, 64, 64), - hidden_size=512, - mlp_dim=3072, - num_layers=12, - num_heads=8, - pos_embed="perceptron", - classification=False, - dropout_rate=0.3, - ) - - with self.assertRaises(ValueError): - ViT( - in_channels=1, - img_size=(96, 96, 96), - patch_size=(8, 8, 8), - hidden_size=512, - mlp_dim=3072, - num_layers=12, - num_heads=14, - pos_embed="conv", - classification=False, - dropout_rate=0.3, - ) - - with self.assertRaises(ValueError): - ViT( - in_channels=1, - img_size=(97, 97, 97), - patch_size=(4, 4, 4), - hidden_size=768, - mlp_dim=3072, - num_layers=12, - num_heads=8, - pos_embed="perceptron", - classification=True, - dropout_rate=0.3, - ) - - with self.assertRaises(ValueError): - ViT( - in_channels=4, - img_size=(96, 96, 96), - patch_size=(16, 16, 16), - hidden_size=768, - mlp_dim=3072, - num_layers=12, - num_heads=12, - pos_embed="perc", - classification=False, - dropout_rate=0.3, - ) - - @parameterized.expand(TEST_CASE_Vit) - @SkipIfBeforePyTorchVersion((1, 9)) + @parameterized.expand(TEST_CASE_Vit[:1]) + @SkipIfBeforePyTorchVersion((2, 0)) def test_script(self, input_param, input_shape, _): net = ViT(**(input_param)) net.eval() diff --git a/tests/test_vitautoenc.py b/tests/test_vitautoenc.py index cc3d493bb3..9a503948d0 100644 --- a/tests/test_vitautoenc.py +++ b/tests/test_vitautoenc.py @@ -23,7 +23,7 @@ for in_channels in [1, 4]: for img_size in [64, 96, 128]: for patch_size in [16]: - for pos_embed in ["conv", "perceptron"]: + for proj_type in ["conv", "perceptron"]: for nd in [2, 3]: test_case = [ { @@ -34,7 +34,7 @@ "mlp_dim": 3072, "num_layers": 4, "num_heads": 12, - "pos_embed": pos_embed, + "proj_type": proj_type, "dropout_rate": 0.6, 
"spatial_dims": nd, }, @@ -54,7 +54,7 @@ "mlp_dim": 3072, "num_layers": 4, "num_heads": 12, - "pos_embed": "conv", + "proj_type": "conv", "dropout_rate": 0.6, "spatial_dims": 3, }, @@ -82,83 +82,30 @@ def test_shape(self, input_param, input_shape, expected_shape): result, _ = net(torch.randn(input_shape)) self.assertEqual(result.shape, expected_shape) - def test_ill_arg(self): + @parameterized.expand( + [ + (1, (32, 32, 32), (64, 64, 64), 512, 3072, 12, 8, "perceptron", 0.3), # img_size_too_large_for_patch_size + (1, (96, 96, 96), (8, 8, 8), 512, 3072, 12, 14, "conv", 0.3), # num_heads_out_of_bound + (1, (97, 97, 97), (4, 4, 4), 768, 3072, 12, 8, "perceptron", 0.3), # img_size_not_divisible_by_patch_size + (4, (96, 96, 96), (16, 16, 16), 768, 3072, 12, 12, "perc", 0.3), # invalid_pos_embed + (4, (96, 96, 96), (9, 9, 9), 768, 3072, 12, 12, "perc", 0.3), # patch_size_not_divisible + # Add more test cases as needed + ] + ) + def test_ill_arg( + self, in_channels, img_size, patch_size, hidden_size, mlp_dim, num_layers, num_heads, proj_type, dropout_rate + ): with self.assertRaises(ValueError): ViTAutoEnc( - in_channels=1, - img_size=(128, 128, 128), - patch_size=(16, 16, 16), - hidden_size=128, - mlp_dim=3072, - num_layers=12, - num_heads=12, - pos_embed="conv", - dropout_rate=5.0, - ) - - with self.assertRaises(ValueError): - ViTAutoEnc( - in_channels=1, - img_size=(32, 32, 32), - patch_size=(64, 64, 64), - hidden_size=512, - mlp_dim=3072, - num_layers=12, - num_heads=8, - pos_embed="perceptron", - dropout_rate=0.3, - ) - - with self.assertRaises(ValueError): - ViTAutoEnc( - in_channels=1, - img_size=(96, 96, 96), - patch_size=(8, 8, 8), - hidden_size=512, - mlp_dim=3072, - num_layers=12, - num_heads=14, - pos_embed="conv", - dropout_rate=0.3, - ) - - with self.assertRaises(ValueError): - ViTAutoEnc( - in_channels=1, - img_size=(97, 97, 97), - patch_size=(4, 4, 4), - hidden_size=768, - mlp_dim=3072, - num_layers=12, - num_heads=8, - pos_embed="perceptron", - 
dropout_rate=0.3, - ) - - with self.assertRaises(ValueError): - ViTAutoEnc( - in_channels=4, - img_size=(96, 96, 96), - patch_size=(16, 16, 16), - hidden_size=768, - mlp_dim=3072, - num_layers=12, - num_heads=12, - pos_embed="perc", - dropout_rate=0.3, - ) - - with self.assertRaises(ValueError): - ViTAutoEnc( - in_channels=4, - img_size=(96, 96, 96), - patch_size=(9, 9, 9), - hidden_size=768, - mlp_dim=3072, - num_layers=12, - num_heads=12, - pos_embed="perc", - dropout_rate=0.3, + in_channels=in_channels, + img_size=img_size, + patch_size=patch_size, + hidden_size=hidden_size, + mlp_dim=mlp_dim, + num_layers=num_layers, + num_heads=num_heads, + proj_type=proj_type, + dropout_rate=dropout_rate, ) diff --git a/tests/test_vqvae.py b/tests/test_vqvae.py new file mode 100644 index 0000000000..4916dc2faa --- /dev/null +++ b/tests/test_vqvae.py @@ -0,0 +1,274 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets.vqvae import VQVAE +from tests.utils import SkipIfBeforePyTorchVersion + +TEST_CASES = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "num_res_layers": 1, + "num_res_channels": (4, 4), + "downsample_parameters": ((2, 4, 1, 1),) * 2, + "upsample_parameters": ((2, 4, 1, 1, 0),) * 2, + "num_embeddings": 8, + "embedding_dim": 8, + }, + (1, 1, 8, 8), + (1, 1, 8, 8), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "num_res_layers": 1, + "num_res_channels": 4, + "downsample_parameters": ((2, 4, 1, 1),) * 2, + "upsample_parameters": ((2, 4, 1, 1, 0),) * 2, + "num_embeddings": 8, + "embedding_dim": 8, + }, + (1, 1, 8, 8, 8), + (1, 1, 8, 8, 8), + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "num_res_layers": 1, + "num_res_channels": (4, 4), + "downsample_parameters": (2, 4, 1, 1), + "upsample_parameters": ((2, 4, 1, 1, 0),) * 2, + "num_embeddings": 8, + "embedding_dim": 8, + }, + (1, 1, 8, 8), + (1, 1, 8, 8), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (4, 4), + "num_res_layers": 1, + "num_res_channels": (4, 4), + "downsample_parameters": ((2, 4, 1, 1),) * 2, + "upsample_parameters": (2, 4, 1, 1, 0), + "num_embeddings": 8, + "embedding_dim": 8, + }, + (1, 1, 8, 8, 8), + (1, 1, 8, 8, 8), + ], +] + +TEST_LATENT_SHAPE = { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "downsample_parameters": ((2, 4, 1, 1),) * 2, + "upsample_parameters": ((2, 4, 1, 1, 0),) * 2, + "num_res_layers": 1, + "channels": (8, 8), + "num_res_channels": (8, 8), + "num_embeddings": 16, + "embedding_dim": 8, +} + + +class TestVQVAE(unittest.TestCase): + @parameterized.expand(TEST_CASES) + def test_shape(self, 
input_param, input_shape, expected_shape): + device = "cuda" if torch.cuda.is_available() else "cpu" + + net = VQVAE(**input_param).to(device) + + with eval_mode(net): + result, _ = net(torch.randn(input_shape).to(device)) + + self.assertEqual(result.shape, expected_shape) + + @parameterized.expand(TEST_CASES) + @SkipIfBeforePyTorchVersion((1, 11)) + def test_shape_with_checkpoint(self, input_param, input_shape, expected_shape): + device = "cuda" if torch.cuda.is_available() else "cpu" + input_param = input_param.copy() + input_param.update({"use_checkpointing": True}) + + net = VQVAE(**input_param).to(device) + + with eval_mode(net): + result, _ = net(torch.randn(input_shape).to(device)) + + self.assertEqual(result.shape, expected_shape) + + # Removed this test case since TorchScript currently does not support activation checkpoint. + # def test_script(self): + # net = VQVAE( + # spatial_dims=2, + # in_channels=1, + # out_channels=1, + # downsample_parameters=((2, 4, 1, 1),) * 2, + # upsample_parameters=((2, 4, 1, 1, 0),) * 2, + # num_res_layers=1, + # channels=(8, 8), + # num_res_channels=(8, 8), + # num_embeddings=16, + # embedding_dim=8, + # ddp_sync=False, + # ) + # test_data = torch.randn(1, 1, 16, 16) + # test_script_save(net, test_data) + + def test_channels_not_same_size_of_num_res_channels(self): + with self.assertRaises(ValueError): + VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(16, 16), + num_res_channels=(16, 16, 16), + downsample_parameters=((2, 4, 1, 1),) * 2, + upsample_parameters=((2, 4, 1, 1, 0),) * 2, + ) + + def test_channels_not_same_size_of_downsample_parameters(self): + with self.assertRaises(ValueError): + VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(16, 16), + num_res_channels=(16, 16), + downsample_parameters=((2, 4, 1, 1),) * 3, + upsample_parameters=((2, 4, 1, 1, 0),) * 2, + ) + + def test_channels_not_same_size_of_upsample_parameters(self): + with self.assertRaises(ValueError): + 
VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(16, 16), + num_res_channels=(16, 16), + downsample_parameters=((2, 4, 1, 1),) * 2, + upsample_parameters=((2, 4, 1, 1, 0),) * 3, + ) + + def test_downsample_parameters_not_sequence_or_int(self): + with self.assertRaises(ValueError): + VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(16, 16), + num_res_channels=(16, 16), + downsample_parameters=(("test", 4, 1, 1),) * 2, + upsample_parameters=((2, 4, 1, 1, 0),) * 2, + ) + + def test_upsample_parameters_not_sequence_or_int(self): + with self.assertRaises(ValueError): + VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(16, 16), + num_res_channels=(16, 16), + downsample_parameters=((2, 4, 1, 1),) * 2, + upsample_parameters=(("test", 4, 1, 1, 0),) * 2, + ) + + def test_downsample_parameter_length_different_4(self): + with self.assertRaises(ValueError): + VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(16, 16), + num_res_channels=(16, 16), + downsample_parameters=((2, 4, 1),) * 3, + upsample_parameters=((2, 4, 1, 1, 0),) * 2, + ) + + def test_upsample_parameter_length_different_5(self): + with self.assertRaises(ValueError): + VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(16, 16), + num_res_channels=(16, 16, 16), + downsample_parameters=((2, 4, 1, 1),) * 2, + upsample_parameters=((2, 4, 1, 1, 0, 1),) * 3, + ) + + def test_encode_shape(self): + device = "cuda" if torch.cuda.is_available() else "cpu" + + net = VQVAE(**TEST_LATENT_SHAPE).to(device) + + with eval_mode(net): + latent = net.encode(torch.randn(1, 1, 32, 32).to(device)) + + self.assertEqual(latent.shape, (1, 8, 8, 8)) + + def test_index_quantize_shape(self): + device = "cuda" if torch.cuda.is_available() else "cpu" + + net = VQVAE(**TEST_LATENT_SHAPE).to(device) + + with eval_mode(net): + latent = net.index_quantize(torch.randn(1, 1, 32, 32).to(device)) + + self.assertEqual(latent.shape, (1, 8, 8)) + 
+ def test_decode_shape(self): + device = "cuda" if torch.cuda.is_available() else "cpu" + + net = VQVAE(**TEST_LATENT_SHAPE).to(device) + + with eval_mode(net): + latent = net.decode(torch.randn(1, 8, 8, 8).to(device)) + + self.assertEqual(latent.shape, (1, 1, 32, 32)) + + def test_decode_samples_shape(self): + device = "cuda" if torch.cuda.is_available() else "cpu" + + net = VQVAE(**TEST_LATENT_SHAPE).to(device) + + with eval_mode(net): + latent = net.decode_samples(torch.randint(low=0, high=16, size=(1, 8, 8)).to(device)) + + self.assertEqual(latent.shape, (1, 1, 32, 32)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_vqvaetransformer_inferer.py b/tests/test_vqvaetransformer_inferer.py new file mode 100644 index 0000000000..36b715f588 --- /dev/null +++ b/tests/test_vqvaetransformer_inferer.py @@ -0,0 +1,295 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest +from unittest import skipUnless + +import torch +from parameterized import parameterized + +from monai.inferers import VQVAETransformerInferer +from monai.networks.nets import VQVAE, DecoderOnlyTransformer +from monai.utils import optional_import +from monai.utils.ordering import Ordering, OrderingType + +einops, has_einops = optional_import("einops") +TEST_CASES = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "channels": (8, 8), + "num_res_channels": (8, 8), + "downsample_parameters": ((2, 4, 1, 1),) * 2, + "upsample_parameters": ((2, 4, 1, 1, 0),) * 2, + "num_res_layers": 1, + "num_embeddings": 16, + "embedding_dim": 8, + }, + { + "num_tokens": 16 + 1, + "max_seq_len": 4, + "attn_layers_dim": 4, + "attn_layers_depth": 2, + "attn_layers_heads": 1, + "with_cross_attention": False, + }, + {"ordering_type": OrderingType.RASTER_SCAN.value, "spatial_dims": 2, "dimensions": (2, 2, 2)}, + (2, 1, 8, 8), + (2, 4, 17), + (2, 2, 2), + ], + [ + { + "spatial_dims": 3, + "in_channels": 1, + "out_channels": 1, + "channels": (8, 8), + "num_res_channels": (8, 8), + "downsample_parameters": ((2, 4, 1, 1),) * 2, + "upsample_parameters": ((2, 4, 1, 1, 0),) * 2, + "num_res_layers": 1, + "num_embeddings": 16, + "embedding_dim": 8, + }, + { + "num_tokens": 16 + 1, + "max_seq_len": 8, + "attn_layers_dim": 4, + "attn_layers_depth": 2, + "attn_layers_heads": 1, + "with_cross_attention": False, + }, + {"ordering_type": OrderingType.RASTER_SCAN.value, "spatial_dims": 3, "dimensions": (2, 2, 2, 2)}, + (2, 1, 8, 8, 8), + (2, 8, 17), + (2, 2, 2, 2), + ], +] + + +class TestVQVAETransformerInferer(unittest.TestCase): + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_prediction_shape( + self, stage_1_params, stage_2_params, ordering_params, input_shape, logits_shape, latent_shape + ): + stage_1 = VQVAE(**stage_1_params) + stage_2 = DecoderOnlyTransformer(**stage_2_params) 
+ ordering = Ordering(**ordering_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + + inferer = VQVAETransformerInferer() + prediction = inferer(inputs=input, vqvae_model=stage_1, transformer_model=stage_2, ordering=ordering) + self.assertEqual(prediction.shape, logits_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_prediction_shape_shorter_sequence( + self, stage_1_params, stage_2_params, ordering_params, input_shape, logits_shape, latent_shape + ): + stage_1 = VQVAE(**stage_1_params) + max_seq_len = 3 + stage_2_params_shorter = dict(stage_2_params) + stage_2_params_shorter["max_seq_len"] = max_seq_len + stage_2 = DecoderOnlyTransformer(**stage_2_params_shorter) + ordering = Ordering(**ordering_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + + inferer = VQVAETransformerInferer() + prediction = inferer(inputs=input, vqvae_model=stage_1, transformer_model=stage_2, ordering=ordering) + cropped_logits_shape = (logits_shape[0], max_seq_len, logits_shape[2]) + self.assertEqual(prediction.shape, cropped_logits_shape) + + @skipUnless(has_einops, "Requires einops") + def test_sample(self): + + stage_1 = VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(8, 8), + num_res_channels=(8, 8), + downsample_parameters=((2, 4, 1, 1),) * 2, + upsample_parameters=((2, 4, 1, 1, 0),) * 2, + num_res_layers=1, + num_embeddings=16, + embedding_dim=8, + ) + stage_2 = DecoderOnlyTransformer( + num_tokens=16 + 1, + max_seq_len=4, + attn_layers_dim=4, + attn_layers_depth=2, + attn_layers_heads=1, + with_cross_attention=False, + ) + ordering = Ordering(ordering_type=OrderingType.RASTER_SCAN.value, spatial_dims=2, dimensions=(2, 2, 
2)) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + inferer = VQVAETransformerInferer() + + starting_token = 16 # from stage_1 num_embeddings + + sample = inferer.sample( + latent_spatial_dim=(2, 2), + starting_tokens=starting_token * torch.ones((2, 1), device=device), + vqvae_model=stage_1, + transformer_model=stage_2, + ordering=ordering, + ) + self.assertEqual(sample.shape, (2, 1, 8, 8)) + + @skipUnless(has_einops, "Requires einops") + def test_sample_shorter_sequence(self): + stage_1 = VQVAE( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(8, 8), + num_res_channels=(8, 8), + downsample_parameters=((2, 4, 1, 1),) * 2, + upsample_parameters=((2, 4, 1, 1, 0),) * 2, + num_res_layers=1, + num_embeddings=16, + embedding_dim=8, + ) + stage_2 = DecoderOnlyTransformer( + num_tokens=16 + 1, + max_seq_len=2, + attn_layers_dim=4, + attn_layers_depth=2, + attn_layers_heads=1, + with_cross_attention=False, + ) + ordering = Ordering(ordering_type=OrderingType.RASTER_SCAN.value, spatial_dims=2, dimensions=(2, 2, 2)) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + inferer = VQVAETransformerInferer() + + starting_token = 16 # from stage_1 num_embeddings + + sample = inferer.sample( + latent_spatial_dim=(2, 2), + starting_tokens=starting_token * torch.ones((2, 1), device=device), + vqvae_model=stage_1, + transformer_model=stage_2, + ordering=ordering, + ) + self.assertEqual(sample.shape, (2, 1, 8, 8)) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_get_likelihood( + self, stage_1_params, stage_2_params, ordering_params, input_shape, logits_shape, latent_shape + ): + stage_1 = VQVAE(**stage_1_params) + stage_2 = DecoderOnlyTransformer(**stage_2_params) + ordering = Ordering(**ordering_params) + + device = "cuda:0" if 
torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + + inferer = VQVAETransformerInferer() + likelihood = inferer.get_likelihood( + inputs=input, vqvae_model=stage_1, transformer_model=stage_2, ordering=ordering + ) + self.assertEqual(likelihood.shape, latent_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_get_likelihood_shorter_sequence( + self, stage_1_params, stage_2_params, ordering_params, input_shape, logits_shape, latent_shape + ): + stage_1 = VQVAE(**stage_1_params) + max_seq_len = 3 + stage_2_params_shorter = dict(stage_2_params) + stage_2_params_shorter["max_seq_len"] = max_seq_len + stage_2 = DecoderOnlyTransformer(**stage_2_params_shorter) + ordering = Ordering(**ordering_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + + inferer = VQVAETransformerInferer() + likelihood = inferer.get_likelihood( + inputs=input, vqvae_model=stage_1, transformer_model=stage_2, ordering=ordering + ) + self.assertEqual(likelihood.shape, latent_shape) + + @parameterized.expand(TEST_CASES) + @skipUnless(has_einops, "Requires einops") + def test_get_likelihood_resampling( + self, stage_1_params, stage_2_params, ordering_params, input_shape, logits_shape, latent_shape + ): + stage_1 = VQVAE(**stage_1_params) + stage_2 = DecoderOnlyTransformer(**stage_2_params) + ordering = Ordering(**ordering_params) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + stage_1.to(device) + stage_2.to(device) + stage_1.eval() + stage_2.eval() + + input = torch.randn(input_shape).to(device) + + inferer = VQVAETransformerInferer() + likelihood = inferer.get_likelihood( + inputs=input, + vqvae_model=stage_1, + transformer_model=stage_2, + ordering=ordering, + 
resample_latent_likelihoods=True, + resample_interpolation_mode="nearest", + ) + self.assertEqual(likelihood.shape, input_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_warp.py b/tests/test_warp.py index bac595224f..0e5f2466db 100644 --- a/tests/test_warp.py +++ b/tests/test_warp.py @@ -124,7 +124,7 @@ def test_itk_benchmark(self): relative_diff = np.mean( np.divide(monai_result - itk_result, itk_result, out=np.zeros_like(itk_result), where=(itk_result != 0)) ) - self.assertTrue(relative_diff < 0.01) + self.assertLess(relative_diff, 0.01) @parameterized.expand(TEST_CASES, skip_on_empty=True) def test_resample(self, input_param, input_data, expected_val): @@ -217,6 +217,7 @@ def itk_warp(img, ddf): # warp warp_filter.SetDisplacementField(displacement_field) warp_filter.SetInput(itk_img) + warp_filter.Update() warped_img = warp_filter.GetOutput() warped_img = np.asarray(warped_img) diff --git a/tests/testing_data/data_config.json b/tests/testing_data/data_config.json index a570c787ba..79033dd0d6 100644 --- a/tests/testing_data/data_config.json +++ b/tests/testing_data/data_config.json @@ -138,13 +138,33 @@ "url": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/ssl_pretrained_weights.pth", "hash_type": "sha256", "hash_val": "c3564f40a6a051d3753a6d8fae5cc8eaf21ce8d82a9a3baf80748d15664055e8" + }, + "decoder_only_transformer_monai_generative_weights": { + "url": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/decoder_only_transformer.pth", + "hash_type": "sha256", + "hash_val": "f93de37d64d77cf91f3bde95cdf93d161aee800074c89a92aff9d5699120ec0d" + }, + "diffusion_model_unet_monai_generative_weights": { + "url": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/diffusion_model_unet.pth", + "hash_type": "sha256", + "hash_val": "0d2171b386902f5b4fd3e967b4024f63e353694ca45091b114970019d045beee" + }, + "autoencoderkl_monai_generative_weights": { + 
"url": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/autoencoderkl.pth", + "hash_type": "sha256", + "hash_val": "6e02c9540c51b16b9ba98b5c0c75d6b84b430afe9a3237df1d67a520f8d34184" + }, + "controlnet_monai_generative_weights": { + "url": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/controlnet.pth", + "hash_type": "sha256", + "hash_val": "cd100d0c69f47569ae5b4b7df653a1cb19f5e02eff1630db3210e2646fb1ab2e" } }, "configs": { "test_meta_file": { - "url": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json", + "url": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20240725.json", "hash_type": "md5", - "hash_val": "662135097106b71067cd1fc657f8720f" + "hash_val": "06954cad2cc5d3784e72077ac76f0fc8" } } } diff --git a/tests/testing_data/fl_infer_properties.json b/tests/testing_data/fl_infer_properties.json new file mode 100644 index 0000000000..72e97cd2c6 --- /dev/null +++ b/tests/testing_data/fl_infer_properties.json @@ -0,0 +1,67 @@ +{ + "bundle_root": { + "description": "root path of the bundle.", + "required": true, + "id": "bundle_root" + }, + "device": { + "description": "target device to execute the bundle workflow.", + "required": true, + "id": "device" + }, + "dataset_dir": { + "description": "directory path of the dataset.", + "required": true, + "id": "dataset_dir" + }, + "dataset": { + "description": "PyTorch dataset object for the inference / evaluation logic.", + "required": true, + "id": "dataset" + }, + "evaluator": { + "description": "inference / evaluation workflow engine.", + "required": true, + "id": "evaluator" + }, + "network_def": { + "description": "network module for the inference.", + "required": true, + "id": "network_def" + }, + "inferer": { + "description": "MONAI Inferer object to execute the model computation in inference.", + "required": true, + "id": "inferer" + }, + 
"dataset_data": { + "description": "data source for the inference / evaluation dataset.", + "required": false, + "id": "dataset::data", + "refer_id": null + }, + "handlers": { + "description": "event-handlers for the inference / evaluation logic.", + "required": false, + "id": "handlers", + "refer_id": "evaluator::val_handlers" + }, + "preprocessing": { + "description": "preprocessing for the input data.", + "required": false, + "id": "preprocessing", + "refer_id": "dataset::transform" + }, + "postprocessing": { + "description": "postprocessing for the model output data.", + "required": false, + "id": "postprocessing", + "refer_id": "evaluator::postprocessing" + }, + "key_metric": { + "description": "the key metric during evaluation.", + "required": false, + "id": "key_metric", + "refer_id": "evaluator::key_val_metric" + } +} diff --git a/tests/testing_data/integration_answers.py b/tests/testing_data/integration_answers.py index c0dd973418..e02b9ae995 100644 --- a/tests/testing_data/integration_answers.py +++ b/tests/testing_data/integration_answers.py @@ -600,6 +600,62 @@ ], } }, + { # test answers for 24.03 + "integration_segmentation_3d": { + "losses": [ + 0.5442982316017151, + 0.4741817444562912, + 0.4535954713821411, + 0.44163046181201937, + 0.4307525992393494, + 0.428487154841423, + ], + "best_metric": 0.9314384460449219, + "infer_metric": 0.9315622448921204, + "output_sums": [ + 0.14268704426414708, + 0.1528672845845743, + 0.1521782248125706, + 0.14028769128068194, + 0.1889830671664784, + 0.16999075690664475, + 0.14736282992708227, + 0.16877952654821815, + 0.15779597155181269, + 0.17987829927082263, + 0.16320253928314676, + 0.16854299322173155, + 0.14497470986956967, + 0.11437140546369519, + 0.1624117412960871, + 0.20156009294443875, + 0.1764654154256958, + 0.0982348259217418, + 0.1942436068604293, + 0.20359421536407518, + 0.19661953116976483, + 0.2088326101468625, + 0.16273043545239807, + 0.1326107887439663, + 0.1489245275752285, + 0.143107476635514, + 
0.23189027677929547, + 0.1613818424566088, + 0.14889532196775188, + 0.10332622984492143, + 0.11940054688302351, + 0.13040496302762658, + 0.11472123087193181, + 0.15307044007394474, + 0.16371989575844717, + 0.1942898223272055, + 0.2230120930471398, + 0.1814679187634795, + 0.19069496508164732, + 0.07537197031940022, + ], + } + }, ] diff --git a/tests/testing_data/metadata.json b/tests/testing_data/metadata.json index 98a17b73c5..29737e3a9d 100644 --- a/tests/testing_data/metadata.json +++ b/tests/testing_data/metadata.json @@ -1,5 +1,5 @@ { - "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json", + "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20240725.json", "version": "0.1.0", "changelog": { "0.1.0": "complete the model package", @@ -8,7 +8,7 @@ "monai_version": "0.9.0", "pytorch_version": "1.10.0", "numpy_version": "1.21.2", - "optional_packages_version": { + "required_packages_version": { "nibabel": "3.2.1" }, "task": "Decathlon spleen segmentation", diff --git a/tests/testing_data/ultrasound_confidence_map/femur_input.png b/tests/testing_data/ultrasound_confidence_map/femur_input.png new file mode 100644 index 0000000000..0343e58720 Binary files /dev/null and b/tests/testing_data/ultrasound_confidence_map/femur_input.png differ diff --git a/tests/testing_data/ultrasound_confidence_map/femur_result.npy b/tests/testing_data/ultrasound_confidence_map/femur_result.npy new file mode 100644 index 0000000000..a3f322b113 Binary files /dev/null and b/tests/testing_data/ultrasound_confidence_map/femur_result.npy differ diff --git a/tests/testing_data/ultrasound_confidence_map/neck_input.png b/tests/testing_data/ultrasound_confidence_map/neck_input.png new file mode 100644 index 0000000000..74a64a9d90 Binary files /dev/null and b/tests/testing_data/ultrasound_confidence_map/neck_input.png differ diff --git 
a/tests/testing_data/ultrasound_confidence_map/neck_result.npy b/tests/testing_data/ultrasound_confidence_map/neck_result.npy new file mode 100644 index 0000000000..8bf760182c Binary files /dev/null and b/tests/testing_data/ultrasound_confidence_map/neck_result.npy differ diff --git a/tests/utils.py b/tests/utils.py index ea73a3ed81..77b53cebb8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -156,6 +156,7 @@ def skip_if_downloading_fails(): "limit", # HTTP Error 503: Egress is over the account limit "authenticate", "timed out", # urlopen error [Errno 110] Connection timed out + "HTTPError", # HTTPError: 429 Client Error: Too Many Requests for huggingface hub ) ): raise unittest.SkipTest(f"error while downloading: {rt_e}") from rt_e # incomplete download @@ -474,7 +475,7 @@ def run_process(self, func, local_rank, args, kwargs, results): if self.verbose: os.environ["NCCL_DEBUG"] = "INFO" os.environ["NCCL_DEBUG_SUBSYS"] = "ALL" - os.environ["NCCL_BLOCKING_WAIT"] = str(1) + os.environ["TORCH_NCCL_BLOCKING_WAIT"] = str(1) os.environ["OMP_NUM_THREADS"] = str(1) os.environ["WORLD_SIZE"] = str(self.nproc_per_node * self.nnodes) os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank)