From ea53bce58003bb2be2d1f7ba162872cf671252aa Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Mon, 12 Dec 2022 17:57:09 +0800 Subject: [PATCH 01/21] [Project] Add Example project. (#1254) --- .gitignore | 2 + projects/README.md | 21 +++ projects/example_project/README.md | 128 ++++++++++++++++++ .../configs/examplenet_8xb32_in1k.py | 10 ++ projects/example_project/models/__init__.py | 3 + .../example_project/models/example_net.py | 31 +++++ 6 files changed, 195 insertions(+) create mode 100644 projects/README.md create mode 100644 projects/example_project/README.md create mode 100644 projects/example_project/configs/examplenet_8xb32_in1k.py create mode 100644 projects/example_project/models/__init__.py create mode 100644 projects/example_project/models/example_net.py diff --git a/.gitignore b/.gitignore index edddb09cda1..6f33f4c2134 100644 --- a/.gitignore +++ b/.gitignore @@ -125,6 +125,8 @@ venv.bak/ *.pkl.json *.log.json /work_dirs +/projects/*/work_dirs +/projects/*/data /mmcls/.mim .DS_Store diff --git a/projects/README.md b/projects/README.md new file mode 100644 index 00000000000..77098dff36d --- /dev/null +++ b/projects/README.md @@ -0,0 +1,21 @@ +# Welcome to Projects of MMClassification + +In this folder, we welcome all contribution of vision deep-learning backbone from community. + +Here, these requirements, e.g. code standards, are not that strict as in core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMClassification. We appreciate all contributions from community to make MMClassification greater. + +Here is an [example project](./example_project) about how to add your algorithms easily. + +We also provide some documentation listed below: + +- [New Model Guide](https://mmclassification.readthedocs.io/en/dev-1.x/advanced_guides/modules.html) + + The documentation of adding new models. + +- [Contribution Guide](https://mmclassification.readthedocs.io/en/dev-1.x/notes/contribution_guide.html) + + The guides for new contributors about how to add your projects to MMClassification. + +- [Discussions](https://github.com/open-mmlab/mmclassification/discussions) + + Welcome to start discussion! diff --git a/projects/example_project/README.md b/projects/example_project/README.md new file mode 100644 index 00000000000..32325b24e02 --- /dev/null +++ b/projects/example_project/README.md @@ -0,0 +1,128 @@ +# Example Project + +This is an example README for community `projects/`. You can write your README in your own project. Here are +some recommended parts of a README for others to understand and use your project, you can copy or modify them +according to your project. + +## Usage + +### Setup Environment + +Please refer to [Get Started](https://mmclassification.readthedocs.io/en/1.x/get_started.html) to install +MMClassification. + +At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the ImageNet-2012 dataset according to the [instruction](https://mmclassification.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html#imagenet). 
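+
+If you follow that instruction, the resulting folder structure looks roughly like the sketch below (an assumption only: `data/imagenet`, `meta/train.txt` and `meta/val.txt` follow the default ImageNet configs used elsewhere in this repository, and your local layout may differ):
+
+```text
+data/imagenet
+├── meta
+│   ├── train.txt
+│   └── val.txt
+├── train
+└── val
+```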
+ +### Training commands + +**To train with single GPU:** + +```bash +mim train mmcls configs/examplenet_8xb32_in1k.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmcls configs/examplenet_8xb32_in1k.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmcls configs/examplenet_8xb32_in1k.py --launcher slurm \ + --gpus 16 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmcls configs/examplenet_8xb32_in1k.py $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmcls configs/examplenet_8xb32_in1k.py $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmcls configs/examplenet_8xb32_in1k.py $CHECKPOINT --launcher slurm \ + --gpus 16 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +| Model | Pretrain | Top-1 (%) | Top-5 (%) | Config | Download | +| :----------------: | :----------: | :-------: | :-------: | :-------------------------------------: | :------------------------------------: | +| ExampleNet-tiny | From scratch | 82.33 | 96.15 | [config](./mvitv2-tiny_8xb256_in1k.py) | [model](MODEL-LINK) \| [log](LOG-LINK) | +| ExampleNet-small\* | From scratch | 83.63 | 96.51 | [config](./mvitv2-small_8xb256_in1k.py) | [model](MODEL-LINK) | +| ExampleNet-base\* | From scratch | 84.34 | 96.86 | [config](./mvitv2-base_8xb256_in1k.py) | [model](MODEL-LINK) | + +*Models with * are converted from the [official repo](REPO-LINK). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + + + +```bibtex +@misc{2020mmclassification, + title={OpenMMLab's Image Classification Toolbox and Benchmark}, + author={MMClassification Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmclassification}}, + year={2020} +} +``` + +## Checklist + +Here is a checklist of this project's progress. And you can ignore this part if you don't plan to contribute +to MMClassification projects. + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [ ] Finish the code + + + + - [ ] Basic docstrings & proper citation + + + + - [ ] Converted checkpoint and results (Only for reproduction) + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/example_project/configs/examplenet_8xb32_in1k.py b/projects/example_project/configs/examplenet_8xb32_in1k.py new file mode 100644 index 00000000000..5e5f89eabb9 --- /dev/null +++ b/projects/example_project/configs/examplenet_8xb32_in1k.py @@ -0,0 +1,10 @@ +# Directly inherit the entire recipe you want to use. +_base_ = 'mmcls::resnet/resnet50_8xb32_in1k.py' + +# This line is to import your own modules. +custom_imports = dict(imports='models') + +# Modify the backbone to use your own backbone. +_base_['model']['backbone'] = dict(type='ExampleNet', depth=18) +# Modify the in_channels of classifier head to fit your backbone. 
+_base_['model']['head']['in_channels'] = 512 diff --git a/projects/example_project/models/__init__.py b/projects/example_project/models/__init__.py new file mode 100644 index 00000000000..e2d4f2f5712 --- /dev/null +++ b/projects/example_project/models/__init__.py @@ -0,0 +1,3 @@ +from .example_net import ExampleNet + +__all__ = ['ExampleNet'] diff --git a/projects/example_project/models/example_net.py b/projects/example_project/models/example_net.py new file mode 100644 index 00000000000..b6ff35dcdf1 --- /dev/null +++ b/projects/example_project/models/example_net.py @@ -0,0 +1,31 @@ +from mmcls.models import ResNet +from mmcls.registry import MODELS + + +# Register your model to the `MODELS`. +@MODELS.register_module() +class ExampleNet(ResNet): + """Implements an example backbone. + + Implement the backbone network just like a normal pytorch network. + """ + + def __init__(self, **kwargs) -> None: + print('#############################\n' + '# Hello MMClassification! #\n' + '#############################') + super().__init__(**kwargs) + + def forward(self, x): + """The forward method of the network. + + Args: + x (torch.Tensor): A tensor of image batch with shape + ``(batch_size, num_channels, height, width)``. + + Returns: + Tuple[torch.Tensor]: Please return a tuple of tensors and every + tensor is a feature map of specified scale. If you only want the + final feature map, simply return a tuple with one item. + """ + return super().forward(x) From 1c6b077bb17e35d04575402fbb4e0cfa8a982a58 Mon Sep 17 00:00:00 2001 From: Ezra-Yu <18586273+Ezra-Yu@users.noreply.github.com> Date: Mon, 12 Dec 2022 18:55:09 +0800 Subject: [PATCH 02/21] [Project] Add ACCV workshop 1st Solution. (#1245) * add accv workshop 1st project * update projects * update projects * fix lint * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * update * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * update * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * Update projects/fgia_accv2022_1st/README.md Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> * update * update * update Co-authored-by: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com> --- projects/fgia_accv2022_1st/README.md | 76 ++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 projects/fgia_accv2022_1st/README.md diff --git a/projects/fgia_accv2022_1st/README.md b/projects/fgia_accv2022_1st/README.md new file mode 100644 index 00000000000..fb642d8ca4c --- /dev/null +++ b/projects/fgia_accv2022_1st/README.md @@ -0,0 +1,76 @@ +# Solution of FGIA ACCV 2022(1st Place) + +This is fine-tuning 
part of the 1st Place Solution for Webly-supervised Fine-grained Recognition. Please refer to the ACCV workshop competition at https://www.cvmart.net/race/10412/base for details.
+
+## Result
+
+ +**LB A** + +![LB-A](https://user-images.githubusercontent.com/18586273/205498131-5728e470-b4f6-43b7-82a5-5f8e3bd5168e.png) + +**LB B** + +![LB-B](https://user-images.githubusercontent.com/18586273/205498171-5a3a3055-370a-4a8b-9779-b686254ebc94.png) + +
+ +
+
+## Reproduce
+
+For the detailed self-supervised pre-training code, please refer to [MMSelfSup](https://github.com/open-mmlab/mmselfsup/tree/dev-1.x/projects/fgia_accv2022_1st).
+For the detailed fine-tuning and inference code, please refer to [this repo](https://github.com/Ezra-Yu/ACCV2022_FGIA_1st).
+
+## Description
+
+### Overview of Our Solution
+
+![image](https://user-images.githubusercontent.com/18586273/205498371-31dbc1f4-5814-44bc-904a-f0d32515c7dd.png)
+
+### Our Models
+
+- ViT (MAE pre-trained) # Pre-trained with [**MMSelfSup**](https://github.com/open-mmlab/mmselfsup/tree/dev-1.x/projects/fgia_accv2022_1st).
+- Swin-v2 (SimMIM pre-trained) # From [MMCls-swin_transformer_v2](https://github.com/open-mmlab/mmclassification/tree/dev-1.x/configs/swin_transformer_v2).
+
+**The architectures we use:**
+
+- ViT + CE-loss + post-LongTail-Adjustment
+- ViT + SubCenterArcFaceWithAdvMargin (CE)
+- Swin-B + SubCenterArcFaceWithAdvMargin (SoftMax-EQL)
+- Swin-L + SubCenterArcFaceWithAdvMargin (SoftMax-EQL)
+
+## Bag of tricks: papers and code
+
+- [MAE](https://github.com/open-mmlab/mmselfsup/tree/dev-1.x/configs/selfsup/mae) | [Config](https://github.com/Ezra-Yu/ACCV_workshop/tree/master/configs/vit)
+- [Swinv2](https://github.com/open-mmlab/mmclassification/tree/dev-1.x/configs/swin_transformer_v2) | [Config](https://github.com/Ezra-Yu/ACCV_workshop/tree/master/configs/swin)
+- [ArcFace](https://arxiv.org/abs/1801.07698) | [Code](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/src/models/arcface_head.py)
+- [SubCenterArcFaceWithAdvMargin](https://paperswithcode.com/paper/sub-center-arcface-boosting-face-recognition) | [Code](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/src/models/arcface_head.py)
+- [Post-LT-adjustment](https://paperswithcode.com/paper/long-tail-learning-via-logit-adjustment) | [Code](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/src/models/linear_head_lt.py)
+- [SoftMaxEQL](https://paperswithcode.com/paper/the-equalization-losses-gradient-driven) | [Code](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/src/models/eql.py)
+- FlipTTA | [Code](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/src/models/tta_classifier.py)
+- Dataset cleaning
+- Self-ensemble: [Uniform model soup](https://arxiv.org/abs/2203.05482) | [Code](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/tools/model_soup.py)
+- [Pseudo-labeling](https://lilianweng.github.io/posts/2021-12-05-semi-supervised/) | [Code](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/tools/creat_pseudo.py)
+- Bagging ensemble | [Code](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/tools/emsemble.py)
+- Post-processing: [re-distribute-label](https://github.com/Ezra-Yu/ACCV_workshop/blob/master/tools/re-distribute-label.py)
+
+![Overview](https://user-images.githubusercontent.com/18586273/205498258-e5720d83-7006-4aea-86b5-aab1a8998c6c.png)
+
+![image](https://user-images.githubusercontent.com/18586273/205498027-def99b0d-a99a-470b-b292-8d5fc83111fc.png)
+
+#### Used but brought no improvement
+
+1. Using a retrieval paradigm to solve this classification task;
+2. Using an EfficientNetV2 backbone.
+
+#### Not used but worth trying
+
+1. Try the [DiVE](https://arxiv.org/abs/2103.15042) algorithm to improve performance on the long-tailed dataset;
+2. Use SimMIM to pre-train Swin-v2 on the competition dataset;
+3. Refine the re-distribute-label tool.

From 210373c09322d4a804baca35978fe825138d2ff7 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Wed, 14 Dec 2022 11:46:39 +0800 Subject: [PATCH 03/21] [Feature] Implementation of RevViT. 
(#1127) * [Feature] implement rev-vit network * can reproduce the RevViT-Small accuracy 79.9 * update * [Feature] update revvit * [Feature] update revvit readme * Update links Co-authored-by: mzr1996 --- .../datasets/imagenet_bs128_revvit_224.py | 85 +++ configs/_base_/models/revvit/revvit-base.py | 29 + configs/_base_/models/revvit/revvit-small.py | 29 + .../schedules/imagenet_bs1024_adamw_revvit.py | 41 + configs/revvit/README.md | 50 ++ configs/revvit/metafile.yml | 48 ++ configs/revvit/revvit-base_8xb256_in1k.py | 6 + configs/revvit/revvit-small_8xb256_in1k.py | 6 + docs/en/api/models.rst | 3 +- mmcls/models/backbones/__init__.py | 2 + mmcls/models/backbones/revvit.py | 708 ++++++++++++++++++ model-index.yml | 1 + .../test_models/test_backbones/test_revvit.py | 141 ++++ tools/model_converters/revvit_to_mmcls.py | 104 +++ 14 files changed, 1252 insertions(+), 1 deletion(-) create mode 100644 configs/_base_/datasets/imagenet_bs128_revvit_224.py create mode 100644 configs/_base_/models/revvit/revvit-base.py create mode 100644 configs/_base_/models/revvit/revvit-small.py create mode 100644 configs/_base_/schedules/imagenet_bs1024_adamw_revvit.py create mode 100644 configs/revvit/README.md create mode 100644 configs/revvit/metafile.yml create mode 100644 configs/revvit/revvit-base_8xb256_in1k.py create mode 100644 configs/revvit/revvit-small_8xb256_in1k.py create mode 100644 mmcls/models/backbones/revvit.py create mode 100644 tests/test_models/test_backbones/test_revvit.py create mode 100644 tools/model_converters/revvit_to_mmcls.py diff --git a/configs/_base_/datasets/imagenet_bs128_revvit_224.py b/configs/_base_/datasets/imagenet_bs128_revvit_224.py new file mode 100644 index 00000000000..12ef45a5676 --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs128_revvit_224.py @@ -0,0 +1,85 @@ +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +bgr_mean = data_preprocessor['mean'][::-1] +bgr_std = data_preprocessor['std'][::-1] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=224, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict( + type='RandAugment', + policies='timm_increasing', + num_policies=2, + total_level=10, + magnitude_level=7, + magnitude_std=0.5, + hparams=dict( + pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')), + dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', # should be 'pixel', but currently not supported + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=bgr_mean, + fill_std=bgr_std), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='ResizeEdge', + scale=256, + edge='short', + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=224), + dict(type='PackClsInputs'), +] + +train_dataloader = dict( + batch_size=256, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/train.txt', + data_prefix='train', + pipeline=train_pipeline), + sampler=dict(type='DefaultSampler', shuffle=True), + persistent_workers=True, +) + +val_dataloader = dict( + batch_size=64, + num_workers=5, + dataset=dict( + type=dataset_type, + 
data_root='data/imagenet', + # ann_file='meta/val.txt', + data_prefix='val', + pipeline=test_pipeline), + sampler=dict(type='DefaultSampler', shuffle=False), + persistent_workers=True, +) +val_evaluator = dict(type='Accuracy', topk=(1, 5)) + +# If you want standard test, please manually configure the test dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator diff --git a/configs/_base_/models/revvit/revvit-base.py b/configs/_base_/models/revvit/revvit-base.py new file mode 100644 index 00000000000..354498ed66d --- /dev/null +++ b/configs/_base_/models/revvit/revvit-base.py @@ -0,0 +1,29 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='RevVisionTransformer', + arch='deit-base', + img_size=224, + patch_size=16, + output_cls_token=False, + avg_token=True, + with_cls_token=False, + ), + neck=None, + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1536, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=.02), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.), + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0) + ]), +) diff --git a/configs/_base_/models/revvit/revvit-small.py b/configs/_base_/models/revvit/revvit-small.py new file mode 100644 index 00000000000..6d43781aee7 --- /dev/null +++ b/configs/_base_/models/revvit/revvit-small.py @@ -0,0 +1,29 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='RevVisionTransformer', + arch='deit-small', + img_size=224, + patch_size=16, + output_cls_token=False, + avg_token=True, + with_cls_token=False, + ), + neck=None, + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=768, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=.02), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.), + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0) + ]), +) diff --git a/configs/_base_/schedules/imagenet_bs1024_adamw_revvit.py b/configs/_base_/schedules/imagenet_bs1024_adamw_revvit.py new file mode 100644 index 00000000000..87fd202ce40 --- /dev/null +++ b/configs/_base_/schedules/imagenet_bs1024_adamw_revvit.py @@ -0,0 +1,41 @@ +# for batch in each gpu is 128, 8 gpu +# lr = 5e-4 * 128 * 8 / 512 = 0.001 +# schedule settings +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=5e-4 * 2048 / 512, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + norm_decay_mult=0.0, + bias_decay_mult=0.0, + custom_keys={ + '.cls_token': dict(decay_mult=0.0), + '.pos_embed': dict(decay_mult=0.0) + }), + clip_grad=dict(max_norm=1.0), +) +# learning policy +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=1e-8 / 2e-3, + by_epoch=True, + end=70, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=70) +] + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1) +val_cfg = dict() +test_cfg = dict() + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# based on the actual training batch size. 
+auto_scale_lr = dict(base_batch_size=1024) diff --git a/configs/revvit/README.md b/configs/revvit/README.md new file mode 100644 index 00000000000..4d0035ea793 --- /dev/null +++ b/configs/revvit/README.md @@ -0,0 +1,50 @@ +# Reversible Vision Transformers
+
+> [Reversible Vision Transformers](https://openaccess.thecvf.com/content/CVPR2022/papers/Mangalam_Reversible_Vision_Transformers_CVPR_2022_paper.pdf)
+
+
+
+## Introduction
+
+**RevViT** is initially described in [Reversible Vision Transformers](https://openaccess.thecvf.com/content/CVPR2022/papers/Mangalam_Reversible_Vision_Transformers_CVPR_2022_paper.pdf), which introduces the reversible design into the vision transformer to reduce the GPU memory footprint required for training.
+
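+
+As an illustration of the idea (not the code added in this PR; `attn` and `ffn` below are placeholder callables standing in for the attention and FFN sub-blocks), the two-stream coupling can be inverted exactly, so block inputs can be recomputed during the backward pass instead of being stored:
+
+```python
+import torch
+
+
+def rev_block_forward(x1, x2, attn, ffn):
+    """Coupled forward: y1 = x1 + attn(x2), y2 = x2 + ffn(y1)."""
+    y1 = x1 + attn(x2)
+    y2 = x2 + ffn(y1)
+    return y1, y2
+
+
+def rev_block_inverse(y1, y2, attn, ffn):
+    """Recover the block inputs exactly from its outputs."""
+    x2 = y2 - ffn(y1)
+    x1 = y1 - attn(x2)
+    return x1, x2
+
+
+# Toy check with deterministic stand-ins for the MHSA / FFN sub-blocks.
+attn, ffn = torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)
+x1, x2 = torch.randn(2, 8), torch.randn(2, 8)
+with torch.no_grad():
+    y1, y2 = rev_block_forward(x1, x2, attn, ffn)
+    r1, r2 = rev_block_inverse(y1, y2, attn, ffn)
+assert torch.allclose(r1, x1, atol=1e-6) and torch.allclose(r2, x2, atol=1e-6)
+```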
+ +
+ +## Abstract + +
+We present Reversible Vision Transformers, a memory efficient architecture design for visual recognition. By decoupling the GPU memory footprint from the depth of the model, Reversible Vision Transformers enable memory efficient scaling of transformer architectures. We adapt two popular models, namely Vision Transformer and Multiscale Vision Transformers, to reversible variants and benchmark extensively across both model sizes and tasks of image classification, object detection and video classification. Reversible Vision Transformers achieve a reduced memory footprint of up to 15.5× at identical model complexity, parameters and accuracy, demonstrating the promise of reversible vision transformers as an efficient backbone for resource limited training regimes. Finally, we find that the additional computational burden of recomputing activations is more than overcome for deeper models, where throughput can increase up to 3.9× over their non-reversible counterparts. +
+ +
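+
+As a quick sanity check of the backbone added in this PR (a minimal sketch, assuming MMClassification 1.x is installed so that `RevVisionTransformer` is registered in `MODELS`; the arch name and input size below are only an example):
+
+```python
+import torch
+
+from mmcls.registry import MODELS
+from mmcls.utils import register_all_modules
+
+register_all_modules()  # make sure all mmcls modules are registered
+
+cfg = dict(
+    type='RevVisionTransformer', arch='deit-small', img_size=224, patch_size=16)
+backbone = MODELS.build(cfg)
+backbone.eval()  # eval mode uses the vanilla (non-reversible) forward path
+
+with torch.no_grad():
+    feats = backbone(torch.rand(1, 3, 224, 224))
+# A tuple with one averaged patch-token feature of shape (1, 2 * embed_dims).
+print(feats[0].shape)
+```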
+ +## Results and models + +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :--------------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :-------------------------------------: | :------------------------------------------------------------------------------------: | +| revvit-small_3rdparty_in1k\* | From scratch | 22.43 | 4.58 | 79.87 | 94.90 | [config](./revvit-small_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/revvit/revvit-base_3rdparty_in1k_20221213-87a7b0a5.pth) | +| revvit-base_3rdparty_in1k\* | From scratch | 87.33 | 17.49 | 81.81 | 95.56 | [config](./revvit-base_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/revvit/revvit-small_3rdparty_in1k_20221213-a3a34f5c.pth) | + +*Models with * are converted from the [official repo](https://github.com/facebookresearch/SlowFast). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +```bibtex +@inproceedings{mangalam2022reversible, + title={Reversible Vision Transformers}, + author={Mangalam, Karttikeya and Fan, Haoqi and Li, Yanghao and Wu, Chao-Yuan and Xiong, Bo and Feichtenhofer, Christoph and Malik, Jitendra}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={10830--10840}, + year={2022} +} +``` diff --git a/configs/revvit/metafile.yml b/configs/revvit/metafile.yml new file mode 100644 index 00000000000..6b5b5818125 --- /dev/null +++ b/configs/revvit/metafile.yml @@ -0,0 +1,48 @@ +Collections: + - Name: RevViT + Metadata: + Training Data: ImageNet-1k + Architecture: + - Vision Transformer + - Reversible + Paper: + URL: https://openaccess.thecvf.com/content/CVPR2022/papers/Mangalam_Reversible_Vision_Transformers_CVPR_2022_paper.pdf + Title: Reversible Vision Transformers + README: configs/revvit/README.md + Code: + Version: v1.0.0rc5 + URL: https://github.com/open-mmlab/mmclassification/blob/1.0.0rc5/mmcls/models/backbones/revvit.py + +Models: + - Name: revvit-small_3rdparty_in1k + Metadata: + FLOPs: 4583427072 + Parameters: 22435432 + In Collection: RevViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.87 + Top 5 Accuracy: 94.90 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/revvit/revvit-base_3rdparty_in1k_20221213-87a7b0a5.pth + Config: configs/revvit/revvit-small_8xb256_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/pyslowfast/rev/REV_VIT_S.pyth + Code: https://github.com/facebookresearch/SlowFast + - Name: revvit-base_3rdparty_in1k + Metadata: + FLOPs: 17490450432 + Parameters: 87337192 + In Collection: RevViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.81 + Top 5 Accuracy: 95.56 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/revvit/revvit-small_3rdparty_in1k_20221213-a3a34f5c.pth + Config: configs/revvit/revvit-base_8xb256_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/pyslowfast/rev/REV_VIT_B.pyth + Code: https://github.com/facebookresearch/SlowFast diff --git a/configs/revvit/revvit-base_8xb256_in1k.py b/configs/revvit/revvit-base_8xb256_in1k.py new file mode 100644 index 00000000000..e4fde5c9487 --- /dev/null +++ b/configs/revvit/revvit-base_8xb256_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/revvit/revvit-base.py', + 
'../_base_/datasets/imagenet_bs128_revvit_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_revvit.py', + '../_base_/default_runtime.py' +] diff --git a/configs/revvit/revvit-small_8xb256_in1k.py b/configs/revvit/revvit-small_8xb256_in1k.py new file mode 100644 index 00000000000..ec3904a3da8 --- /dev/null +++ b/configs/revvit/revvit-small_8xb256_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/revvit/revvit-small.py', + '../_base_/datasets/imagenet_bs128_revvit_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_revvit.py', + '../_base_/default_runtime.py' +] diff --git a/docs/en/api/models.rst b/docs/en/api/models.rst index 146b3d2e83e..24ce3f77411 100644 --- a/docs/en/api/models.rst +++ b/docs/en/api/models.rst @@ -73,8 +73,8 @@ Backbones EdgeNeXt EfficientFormer EfficientNet - HorNet HRNet + HorNet InceptionV3 LeNet5 MViT @@ -96,6 +96,7 @@ Backbones ResNetV1c ResNetV1d ResNet_CIFAR + RevVisionTransformer SEResNeXt SEResNet SVT diff --git a/mmcls/models/backbones/__init__.py b/mmcls/models/backbones/__init__.py index cde9ed7f276..b29e63e250a 100644 --- a/mmcls/models/backbones/__init__.py +++ b/mmcls/models/backbones/__init__.py @@ -32,6 +32,7 @@ from .resnet import ResNet, ResNetV1c, ResNetV1d from .resnet_cifar import ResNet_CIFAR from .resnext import ResNeXt +from .revvit import RevVisionTransformer from .seresnet import SEResNet from .seresnext import SEResNeXt from .shufflenet_v1 import ShuffleNetV1 @@ -99,4 +100,5 @@ 'MobileViT', 'DaViT', 'BEiT', + 'RevVisionTransformer', ] diff --git a/mmcls/models/backbones/revvit.py b/mmcls/models/backbones/revvit.py new file mode 100644 index 00000000000..56454774ffb --- /dev/null +++ b/mmcls/models/backbones/revvit.py @@ -0,0 +1,708 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import sys +from typing import Sequence + +import numpy as np +import torch +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.transformer import FFN, PatchEmbed +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import trunc_normal_ +from torch import nn +from torch.autograd import Function as Function + +from mmcls.models.backbones.base_backbone import BaseBackbone +from mmcls.registry import MODELS +from ..utils import MultiheadAttention, resize_pos_embed, to_2tuple + + +class RevBackProp(Function): + """Custom Backpropagation function to allow (A) flushing memory in forward + and (B) activation recomputation reversibly in backward for gradient + calculation. + + Inspired by + https://github.com/RobinBruegger/RevTorch/blob/master/revtorch/revtorch.py + """ + + @staticmethod + def forward( + ctx, + x, + layers, + buffer_layers, # List of layer ids for int activation to buffer + ): + """Reversible Forward pass. + + Any intermediate activations from `buffer_layers` are cached in ctx for + forward pass. This is not necessary for standard usecases. Each + reversible layer implements its own forward pass logic. 
+ """ + buffer_layers.sort() + x1, x2 = torch.chunk(x, 2, dim=-1) + intermediate = [] + + for layer in layers: + x1, x2 = layer(x1, x2) + if layer.layer_id in buffer_layers: + intermediate.extend([x1.detach(), x2.detach()]) + + if len(buffer_layers) == 0: + all_tensors = [x1.detach(), x2.detach()] + else: + intermediate = [torch.LongTensor(buffer_layers), *intermediate] + all_tensors = [x1.detach(), x2.detach(), *intermediate] + + ctx.save_for_backward(*all_tensors) + ctx.layers = layers + + return torch.cat([x1, x2], dim=-1) + + @staticmethod + def backward(ctx, dx): + """Reversible Backward pass. + + Any intermediate activations from `buffer_layers` are recovered from + ctx. Each layer implements its own loic for backward pass (both + activation recomputation and grad calculation). + """ + d_x1, d_x2 = torch.chunk(dx, 2, dim=-1) + # retrieve params from ctx for backward + x1, x2, *int_tensors = ctx.saved_tensors + # no buffering + if len(int_tensors) != 0: + buffer_layers = int_tensors[0].tolist() + else: + buffer_layers = [] + + layers = ctx.layers + + for _, layer in enumerate(layers[::-1]): + if layer.layer_id in buffer_layers: + x1, x2, d_x1, d_x2 = layer.backward_pass( + y1=int_tensors[buffer_layers.index(layer.layer_id) * 2 + + 1], + y2=int_tensors[buffer_layers.index(layer.layer_id) * 2 + + 2], + d_y1=d_x1, + d_y2=d_x2, + ) + else: + x1, x2, d_x1, d_x2 = layer.backward_pass( + y1=x1, + y2=x2, + d_y1=d_x1, + d_y2=d_x2, + ) + + dx = torch.cat([d_x1, d_x2], dim=-1) + + del int_tensors + del d_x1, d_x2, x1, x2 + + return dx, None, None + + +class RevTransformerEncoderLayer(BaseModule): + """Reversible Transformer Encoder Layer. + + This module is a building block of Reversible Transformer Encoder, + which support backpropagation without storing activations. + The residual connection is not applied to the FFN layer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed. + Default: 0.0 + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0 + drop_path_rate (float): stochastic depth rate. + Default 0.0 + num_fcs (int): The number of linear in FFN + Default: 2 + qkv_bias (bool): enable bias for qkv if True. + Default: True + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU') + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + layer_id (int): The layer id of current layer. Used in RevBackProp. + Default: 0 + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ + + def __init__(self, + embed_dims: int, + num_heads: int, + feedforward_channels: int, + drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + num_fcs: int = 2, + qkv_bias: bool = True, + act_cfg: dict = dict(type='GELU'), + norm_cfg: dict = dict(type='LN'), + layer_id: int = 0, + init_cfg=None): + super(RevTransformerEncoderLayer, self).__init__(init_cfg=init_cfg) + + self.drop_path_cfg = dict(type='DropPath', drop_prob=drop_path_rate) + self.embed_dims = embed_dims + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.embed_dims, postfix=1) + self.add_module(self.norm1_name, norm1) + + self.attn = MultiheadAttention( + embed_dims=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + qkv_bias=qkv_bias) + + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, self.embed_dims, postfix=2) + self.add_module(self.norm2_name, norm2) + + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=num_fcs, + ffn_drop=drop_rate, + act_cfg=act_cfg, + add_identity=False) + + self.layer_id = layer_id + self.seeds = {} + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + @property + def norm2(self): + return getattr(self, self.norm2_name) + + def init_weights(self): + super(RevTransformerEncoderLayer, self).init_weights() + for m in self.ffn.modules(): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + nn.init.normal_(m.bias, std=1e-6) + + def seed_cuda(self, key): + """Fix seeds to allow for stochastic elements such as dropout to be + reproduced exactly in activation recomputation in the backward pass.""" + # randomize seeds + # use cuda generator if available + if (hasattr(torch.cuda, 'default_generators') + and len(torch.cuda.default_generators) > 0): + # GPU + device_idx = torch.cuda.current_device() + seed = torch.cuda.default_generators[device_idx].seed() + else: + # CPU + seed = int(torch.seed() % sys.maxsize) + + self.seeds[key] = seed + torch.manual_seed(self.seeds[key]) + + def forward(self, x1, x2): + """ + Implementation of Reversible TransformerEncoderLayer + + ` + x = x + self.attn(self.norm1(x)) + x = self.ffn(self.norm2(x), identity=x) + ` + """ + self.seed_cuda('attn') + # attention output + f_x2 = self.attn(self.norm1(x2)) + # apply droppath on attention output + self.seed_cuda('droppath') + f_x2_dropped = build_dropout(self.drop_path_cfg)(f_x2) + y1 = x1 + f_x2_dropped + + # free memory + if self.training: + del x1 + + # ffn output + self.seed_cuda('ffn') + g_y1 = self.ffn(self.norm2(y1)) + # apply droppath on ffn output + torch.manual_seed(self.seeds['droppath']) + g_y1_dropped = build_dropout(self.drop_path_cfg)(g_y1) + # final output + y2 = x2 + g_y1_dropped + + # free memory + if self.training: + del x2 + + return y1, y2 + + def backward_pass(self, y1, y2, d_y1, d_y2): + """Activation re-compute with the following equation. 
+ + x2 = y2 - g(y1), g = FFN + x1 = y1 - f(x2), f = MSHA + """ + + # temporarily record intermediate activation for G + # and use them for gradient calculation of G + with torch.enable_grad(): + y1.requires_grad = True + + torch.manual_seed(self.seeds['ffn']) + g_y1 = self.ffn(self.norm2(y1)) + + torch.manual_seed(self.seeds['droppath']) + g_y1 = build_dropout(self.drop_path_cfg)(g_y1) + + g_y1.backward(d_y2, retain_graph=True) + + # activate recomputation is by design and not part of + # the computation graph in forward pass + with torch.no_grad(): + x2 = y2 - g_y1 + del g_y1 + + d_y1 = d_y1 + y1.grad + y1.grad = None + + # record F activation and calculate gradients on F + with torch.enable_grad(): + x2.requires_grad = True + + torch.manual_seed(self.seeds['attn']) + f_x2 = self.attn(self.norm1(x2)) + + torch.manual_seed(self.seeds['droppath']) + f_x2 = build_dropout(self.drop_path_cfg)(f_x2) + + f_x2.backward(d_y1, retain_graph=True) + + # propagate reverse computed activations at the + # start of the previous block + with torch.no_grad(): + x1 = y1 - f_x2 + del f_x2, y1 + + d_y2 = d_y2 + x2.grad + + x2.grad = None + x2 = x2.detach() + + return x1, x2, d_y1, d_y2 + + +class TwoStreamFusion(nn.Module): + """A general constructor for neural modules fusing two equal sized tensors + in forward. + + Args: + mode (str): The mode of fusion. Options are 'add', 'max', 'min', + 'avg', 'concat'. + """ + + def __init__(self, mode: str): + super().__init__() + self.mode = mode + + if mode == 'add': + self.fuse_fn = lambda x: torch.stack(x).sum(dim=0) + elif mode == 'max': + self.fuse_fn = lambda x: torch.stack(x).max(dim=0).values + elif mode == 'min': + self.fuse_fn = lambda x: torch.stack(x).min(dim=0).values + elif mode == 'avg': + self.fuse_fn = lambda x: torch.stack(x).mean(dim=0) + elif mode == 'concat': + self.fuse_fn = lambda x: torch.cat(x, dim=-1) + else: + raise NotImplementedError + + def forward(self, x): + # split the tensor into two halves in the channel dimension + x = torch.chunk(x, 2, dim=2) + return self.fuse_fn(x) + + +@MODELS.register_module() +class RevVisionTransformer(BaseBackbone): + """Reversible Vision Transformer. + + A PyTorch implementation of : `Reversible Vision Transformers `_ # noqa: E501 + + Args: + arch (str | dict): Vision Transformer architecture. If use string, + choose from 'small', 'base', 'large', 'deit-tiny', 'deit-small' + and 'deit-base'. If use dict, it should have below keys: + + - **embed_dims** (int): The dimensions of embedding. + - **num_layers** (int): The number of transformer encoder layers. + - **num_heads** (int): The number of heads in attention modules. + - **feedforward_channels** (int): The hidden dimensions in + feedforward modules. + + Defaults to 'base'. + img_size (int | tuple): The expected input image shape. Because we + support dynamic input shape, just set the argument to the most + common input image shape. Defaults to 224. + patch_size (int | tuple): The patch size in patch embedding. + Defaults to 16. + in_channels (int): The num of input channels. Defaults to 3. + out_indices (Sequence | int): Output from which stages. + Defaults to -1, means the last stage. + drop_rate (float): Probability of an element to be zeroed. + Defaults to 0. + drop_path_rate (float): stochastic depth rate. Defaults to 0. + qkv_bias (bool): Whether to add bias for qkv in attention modules. + Defaults to True. + norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='LN')``. 
+ final_norm (bool): Whether to add a additional layer to normalize + final feature map. Defaults to True. + with_cls_token (bool): Whether concatenating class token into image + tokens as transformer input. Defaults to True. + avg_token (bool): Whether or not to use the mean patch token for + classification. If True, the model will only take the average + of all patch tokens. Defaults to False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Defaults to -1. + output_cls_token (bool): Whether output the cls_token. If set True, + ``with_cls_token`` must be True. Defaults to True. + interpolate_mode (str): Select the interpolate mode for position + embeding vector resize. Defaults to "bicubic". + patch_cfg (dict): Configs of patch embeding. Defaults to an empty dict. + layer_cfgs (Sequence | dict): Configs of each transformer layer in + encoder. Defaults to an empty dict. + fusion_mode (str): The fusion mode of transformer layers. + Defaults to 'concat'. + no_custom_backward (bool): Whether to use custom backward. + Defaults to False. + init_cfg (dict, optional): Initialization config dict. + Defaults to None. + """ + arch_zoo = { + **dict.fromkeys( + ['s', 'small'], { + 'embed_dims': 768, + 'num_layers': 8, + 'num_heads': 8, + 'feedforward_channels': 768 * 3, + }), + **dict.fromkeys( + ['b', 'base'], { + 'embed_dims': 768, + 'num_layers': 12, + 'num_heads': 12, + 'feedforward_channels': 3072 + }), + **dict.fromkeys( + ['l', 'large'], { + 'embed_dims': 1024, + 'num_layers': 24, + 'num_heads': 16, + 'feedforward_channels': 4096 + }), + **dict.fromkeys( + ['h', 'huge'], + { + # The same as the implementation in MAE + # + 'embed_dims': 1280, + 'num_layers': 32, + 'num_heads': 16, + 'feedforward_channels': 5120 + }), + **dict.fromkeys( + ['deit-t', 'deit-tiny'], { + 'embed_dims': 192, + 'num_layers': 12, + 'num_heads': 3, + 'feedforward_channels': 192 * 4 + }), + **dict.fromkeys( + ['deit-s', 'deit-small'], { + 'embed_dims': 384, + 'num_layers': 12, + 'num_heads': 6, + 'feedforward_channels': 384 * 4 + }), + **dict.fromkeys( + ['deit-b', 'deit-base'], { + 'embed_dims': 768, + 'num_layers': 12, + 'num_heads': 12, + 'feedforward_channels': 768 * 4 + }), + } + # Some structures have multiple extra tokens, like DeiT. 
+ num_extra_tokens = 1 # cls_token + + def __init__(self, + arch='base', + img_size=224, + patch_size=16, + in_channels=3, + out_indices=-1, + drop_rate=0., + drop_path_rate=0., + qkv_bias=True, + norm_cfg=dict(type='LN', eps=1e-6), + final_norm=True, + with_cls_token=False, + avg_token=True, + frozen_stages=-1, + output_cls_token=False, + interpolate_mode='bicubic', + patch_cfg=dict(), + layer_cfgs=dict(), + fusion_mode='concat', + no_custom_backward=False, + init_cfg=None): + super(RevVisionTransformer, self).__init__(init_cfg) + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = { + 'embed_dims', 'num_layers', 'num_heads', 'feedforward_channels' + } + assert isinstance(arch, dict) and essential_keys <= set(arch), \ + f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.num_layers = self.arch_settings['num_layers'] + self.img_size = to_2tuple(img_size) + self.no_custom_backward = no_custom_backward + + # Set patch embedding + _patch_cfg = dict( + in_channels=in_channels, + input_size=img_size, + embed_dims=self.embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=patch_size, + ) + _patch_cfg.update(patch_cfg) + self.patch_embed = PatchEmbed(**_patch_cfg) + self.patch_resolution = self.patch_embed.init_out_size + num_patches = self.patch_resolution[0] * self.patch_resolution[1] + + # Set cls token + if output_cls_token: + assert with_cls_token is True, f'with_cls_token must be True if' \ + f'set output_cls_token to True, but got {with_cls_token}' + self.with_cls_token = with_cls_token + assert with_cls_token is False, 'with_cls_token=True is not supported' + + self.output_cls_token = output_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims)) + + # Set position embedding + self.interpolate_mode = interpolate_mode + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_extra_tokens, + self.embed_dims)) + self._register_load_state_dict_pre_hook(self._prepare_pos_embed) + + self.drop_after_pos = nn.Dropout(p=drop_rate) + + if isinstance(out_indices, int): + out_indices = [out_indices] + assert isinstance(out_indices, Sequence), \ + f'"out_indices" must by a sequence or int, ' \ + f'get {type(out_indices)} instead.' + for i, index in enumerate(out_indices): + if index < 0: + out_indices[i] = self.num_layers + index + assert 0 <= out_indices[i] <= self.num_layers, \ + f'Invalid out_indices {index}' + self.out_indices = out_indices + assert out_indices == [-1] or out_indices == [self.num_layers - 1], \ + f'only support output last layer current, but got {out_indices}' + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, self.num_layers) + + self.layers = ModuleList() + if isinstance(layer_cfgs, dict): + layer_cfgs = [layer_cfgs] * self.num_layers + for i in range(self.num_layers): + _layer_cfg = dict( + embed_dims=self.embed_dims, + num_heads=self.arch_settings['num_heads'], + feedforward_channels=self. 
+ arch_settings['feedforward_channels'], + drop_rate=drop_rate, + drop_path_rate=dpr[i], + qkv_bias=qkv_bias, + layer_id=i, + norm_cfg=norm_cfg) + _layer_cfg.update(layer_cfgs[i]) + self.layers.append(RevTransformerEncoderLayer(**_layer_cfg)) + + # fusion operation for the final output + self.fusion_layer = TwoStreamFusion(mode=fusion_mode) + + self.frozen_stages = frozen_stages + self.final_norm = final_norm + if final_norm: + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.embed_dims * 2, postfix=1) + self.add_module(self.norm1_name, norm1) + + self.avg_token = avg_token + + # freeze stages only when self.frozen_stages > 0 + if self.frozen_stages > 0: + self._freeze_stages() + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + def init_weights(self): + super(RevVisionTransformer, self).init_weights() + if not (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + trunc_normal_(self.pos_embed, std=0.02) + + def _prepare_pos_embed(self, state_dict, prefix, *args, **kwargs): + name = prefix + 'pos_embed' + if name not in state_dict.keys(): + return + + ckpt_pos_embed_shape = state_dict[name].shape + if self.pos_embed.shape != ckpt_pos_embed_shape: + from mmengine.logging import MMLogger + logger = MMLogger.get_current_instance() + logger.info( + f'Resize the pos_embed shape from {ckpt_pos_embed_shape} ' + f'to {self.pos_embed.shape}.') + + ckpt_pos_embed_shape = to_2tuple( + int(np.sqrt(ckpt_pos_embed_shape[1] - self.num_extra_tokens))) + pos_embed_shape = self.patch_embed.init_out_size + + state_dict[name] = resize_pos_embed(state_dict[name], + ckpt_pos_embed_shape, + pos_embed_shape, + self.interpolate_mode, + self.num_extra_tokens) + + @staticmethod + def resize_pos_embed(*args, **kwargs): + """Interface for backward-compatibility.""" + return resize_pos_embed(*args, **kwargs) + + def _freeze_stages(self): + # freeze position embedding + self.pos_embed.requires_grad = False + # set dropout to eval model + self.drop_after_pos.eval() + # freeze patch embedding + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + # freeze cls_token + # self.cls_token.requires_grad = False + # freeze layers + for i in range(1, self.frozen_stages + 1): + m = self.layers[i - 1] + m.eval() + for param in m.parameters(): + param.requires_grad = False + # freeze the last layer norm + if self.frozen_stages == len(self.layers) and self.final_norm: + self.norm1.eval() + for param in self.norm1.parameters(): + param.requires_grad = False + + def forward(self, x): + B = x.shape[0] + x, patch_resolution = self.patch_embed(x) + + # stole cls_tokens impl from Phil Wang, thanks + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + x = x + resize_pos_embed( + self.pos_embed, + self.patch_resolution, + patch_resolution, + mode=self.interpolate_mode, + num_extra_tokens=self.num_extra_tokens) + x = self.drop_after_pos(x) + + if not self.with_cls_token: + # Remove class token for transformer encoder input + x = x[:, 1:] + + x = torch.cat([x, x], dim=-1) + + # forward with different conditions + if not self.training or self.no_custom_backward: + # in eval/inference model + executing_fn = RevVisionTransformer._forward_vanilla_bp + else: + # use custom backward when self.training=True. 
+ executing_fn = RevBackProp.apply + + x = executing_fn(x, self.layers, []) + + if self.final_norm: + x = self.norm1(x) + x = self.fusion_layer(x) + + if self.with_cls_token: + # RevViT does not allow cls_token + raise NotImplementedError + else: + # (B, H, W, C) + _, __, C = x.shape + patch_token = x.reshape(B, *patch_resolution, C) + # (B, C, H, W) + patch_token = patch_token.permute(0, 3, 1, 2) + cls_token = None + + if self.avg_token: + # (B, H, W, C) + patch_token = patch_token.permute(0, 2, 3, 1) + # (B, L, C) -> (B, C) + patch_token = patch_token.reshape( + B, patch_resolution[0] * patch_resolution[1], C).mean(dim=1) + + if self.output_cls_token: + out = [patch_token, cls_token] + else: + out = patch_token + + return tuple([out]) + + @staticmethod + def _forward_vanilla_bp(hidden_state, layers, buffer=[]): + """Using reversible layers without reversible backpropagation. + + Debugging purpose only. Activated with self.no_custom_backward + """ + # split into ffn state(ffn_out) and attention output(attn_out) + ffn_out, attn_out = torch.chunk(hidden_state, 2, dim=-1) + del hidden_state + + for _, layer in enumerate(layers): + attn_out, ffn_out = layer(attn_out, ffn_out) + + return torch.cat([attn_out, ffn_out], dim=-1) diff --git a/model-index.yml b/model-index.yml index 8f57a4e14a7..60ef92c60c3 100644 --- a/model-index.yml +++ b/model-index.yml @@ -42,3 +42,4 @@ Import: - configs/csra/metafile.yml - configs/beit/metafile.yml - configs/beitv2/metafile.yml + - configs/revvit/metafile.yml diff --git a/tests/test_models/test_backbones/test_revvit.py b/tests/test_models/test_backbones/test_revvit.py new file mode 100644 index 00000000000..d50bffc11ac --- /dev/null +++ b/tests/test_models/test_backbones/test_revvit.py @@ -0,0 +1,141 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import tempfile +from copy import deepcopy +from unittest import TestCase + +import torch +from mmengine.runner import load_checkpoint, save_checkpoint + +from mmcls.models.backbones import RevVisionTransformer +from .utils import timm_resize_pos_embed + + +class TestRevVisionTransformer(TestCase): + + def setUp(self): + self.cfg = dict( + arch='b', img_size=224, patch_size=16, drop_path_rate=0.1) + + def test_structure(self): + # Test invalid default arch + with self.assertRaisesRegex(AssertionError, 'not in default archs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = 'unknown' + RevVisionTransformer(**cfg) + + # Test invalid custom arch + with self.assertRaisesRegex(AssertionError, 'Custom arch needs'): + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'num_layers': 24, + 'num_heads': 16, + 'feedforward_channels': 4096 + } + RevVisionTransformer(**cfg) + + # Test custom arch + cfg = deepcopy(self.cfg) + cfg['arch'] = { + 'embed_dims': 128, + 'num_layers': 24, + 'num_heads': 16, + 'feedforward_channels': 1024 + } + model = RevVisionTransformer(**cfg) + self.assertEqual(model.embed_dims, 128) + self.assertEqual(model.num_layers, 24) + for layer in model.layers: + self.assertEqual(layer.attn.num_heads, 16) + self.assertEqual(layer.ffn.feedforward_channels, 1024) + + # Test out_indices + # TODO: to be implemented, current only support last layer + cfg = deepcopy(self.cfg) + cfg['out_indices'] = {1: 1} + with self.assertRaisesRegex(AssertionError, "get "): + RevVisionTransformer(**cfg) + cfg['out_indices'] = [13] + with self.assertRaisesRegex(AssertionError, 'Invalid out_indices 13'): + RevVisionTransformer(**cfg) + + # Test model structure + cfg = deepcopy(self.cfg) + model = RevVisionTransformer(**cfg) + self.assertEqual(len(model.layers), 12) + dpr_inc = 0.1 / (12 - 1) + dpr = 0 + for layer in model.layers: + self.assertEqual(layer.attn.embed_dims, 768) + self.assertEqual(layer.attn.num_heads, 12) + self.assertEqual(layer.ffn.feedforward_channels, 3072) + # self.assertAlmostEqual(layer.attn.out_drop.drop_prob, dpr) + # self.assertAlmostEqual(layer.ffn.dropout_layer.drop_prob, dpr) + dpr += dpr_inc + + def test_init_weights(self): + # test weight init cfg + cfg = deepcopy(self.cfg) + cfg['init_cfg'] = [ + dict( + type='Kaiming', + layer='Conv2d', + mode='fan_in', + nonlinearity='linear') + ] + model = RevVisionTransformer(**cfg) + ori_weight = model.patch_embed.projection.weight.clone().detach() + # The pos_embed is all zero before initialize + self.assertTrue(torch.allclose(model.pos_embed, torch.tensor(0.))) + + model.init_weights() + initialized_weight = model.patch_embed.projection.weight + self.assertFalse(torch.allclose(ori_weight, initialized_weight)) + self.assertFalse(torch.allclose(model.pos_embed, torch.tensor(0.))) + + # test load checkpoint + pretrain_pos_embed = model.pos_embed.clone().detach() + tmpdir = tempfile.gettempdir() + checkpoint = os.path.join(tmpdir, 'test.pth') + save_checkpoint(model.state_dict(), checkpoint) + cfg = deepcopy(self.cfg) + model = RevVisionTransformer(**cfg) + load_checkpoint(model, checkpoint, strict=True) + self.assertTrue(torch.allclose(model.pos_embed, pretrain_pos_embed)) + + # test load checkpoint with different img_size + cfg = deepcopy(self.cfg) + cfg['img_size'] = 384 + model = RevVisionTransformer(**cfg) + load_checkpoint(model, checkpoint, strict=True) + resized_pos_embed = timm_resize_pos_embed(pretrain_pos_embed, + model.pos_embed) + self.assertTrue(torch.allclose(model.pos_embed, resized_pos_embed)) + + os.remove(checkpoint) + 
+ def test_forward(self): + imgs = torch.randn(1, 3, 224, 224) + + cfg = deepcopy(self.cfg) + cfg['with_cls_token'] = False + cfg['output_cls_token'] = False + model = RevVisionTransformer(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + patch_token = outs[-1] + self.assertEqual(patch_token.shape, (1, 768 * 2)) + + # Test forward with dynamic input size + imgs1 = torch.randn(1, 3, 224, 224) + imgs2 = torch.randn(1, 3, 256, 256) + imgs3 = torch.randn(1, 3, 256, 309) + cfg = deepcopy(self.cfg) + model = RevVisionTransformer(**cfg) + for imgs in [imgs1, imgs2, imgs3]: + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + avg_token = outs[-1] + self.assertEqual(avg_token.shape, (1, 768 * 2)) diff --git a/tools/model_converters/revvit_to_mmcls.py b/tools/model_converters/revvit_to_mmcls.py new file mode 100644 index 00000000000..8b5f1dd5edb --- /dev/null +++ b/tools/model_converters/revvit_to_mmcls.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from collections import OrderedDict + +import mmengine +import torch +from mmengine.runner import CheckpointLoader + + +def convert_revvit(ckpt): + + new_ckpt = OrderedDict() + + for k, v in list(ckpt.items()): + new_v = v + if k.startswith('head.projection'): + new_k = k.replace('head.projection', 'head.fc') + new_ckpt[new_k] = new_v + continue + elif k.startswith('patch_embed'): + if 'proj.' in k: + new_k = k.replace('proj.', 'projection.') + else: + new_k = k + elif k.startswith('rev_backbone'): + new_k = k.replace('rev_backbone.', '') + if 'F.norm' in k: + new_k = new_k.replace('F.norm', 'ln1') + elif 'G.norm' in k: + new_k = new_k.replace('G.norm', 'ln2') + elif 'F.attn' in k: + new_k = new_k.replace('F.attn', 'attn') + elif 'G.mlp.fc1' in k: + new_k = new_k.replace('G.mlp.fc1', 'ffn.layers.0.0') + elif 'G.mlp.fc2' in k: + new_k = new_k.replace('G.mlp.fc2', 'ffn.layers.1') + elif k.startswith('norm'): + new_k = k.replace('norm', 'ln1') + else: + new_k = k + + if not new_k.startswith('head'): + new_k = 'backbone.' + new_k + new_ckpt[new_k] = new_v + + tmp_weight_dir = [] + tmp_bias_dir = [] + final_ckpt = OrderedDict() + for k, v in list(new_ckpt.items()): + if 'attn.q.weight' in k: + tmp_weight_dir.append(v) + elif 'attn.k.weight' in k: + tmp_weight_dir.append(v) + elif 'attn.v.weight' in k: + tmp_weight_dir.append(v) + new_k = k.replace('attn.v.weight', 'attn.qkv.weight') + final_ckpt[new_k] = torch.cat(tmp_weight_dir, dim=0) + tmp_weight_dir = [] + elif 'attn.q.bias' in k: + tmp_bias_dir.append(v) + elif 'attn.k.bias' in k: + tmp_bias_dir.append(v) + elif 'attn.v.bias' in k: + tmp_bias_dir.append(v) + new_k = k.replace('attn.v.bias', 'attn.qkv.bias') + final_ckpt[new_k] = torch.cat(tmp_bias_dir, dim=0) + tmp_bias_dir = [] + else: + final_ckpt[k] = v + + # add pos embed for cls token + if k == 'backbone.pos_embed': + v = torch.cat([torch.ones_like(v).mean(dim=1, keepdim=True), v], + dim=1) + final_ckpt[k] = v + + return final_ckpt + + +def main(): + parser = argparse.ArgumentParser( + description='Convert keys in pretrained van models to mmcls style.') + parser.add_argument('src', help='src model path or url') + # The dst path must be a full path of the new checkpoint. 
+ parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + + if 'model_state' in checkpoint: + state_dict = checkpoint['model_state'] + else: + state_dict = checkpoint + + weight = convert_revvit(state_dict) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(weight, args.dst) + + print('Done!!') + + +if __name__ == '__main__': + main() From 2535c1ecd7e3509dcd7a86c9dcb55736bb4ad86d Mon Sep 17 00:00:00 2001 From: takuoko Date: Wed, 14 Dec 2022 14:21:33 +0900 Subject: [PATCH 04/21] [Feature] Support EVA. (#1239) * add eva * add eva * add eva * sklearn -> scikit-learn * add large * Update model names and links. * Fix resize pos embed error when loading fp16 weight. * Remove verbose configs. Co-authored-by: mzr1996 --- .../benchmark_regression/1-benchmark_valid.py | 48 ++-- README.md | 1 + README_zh-CN.md | 1 + .../_base_/datasets/imagenet_bs16_eva_196.py | 62 +++++ .../_base_/datasets/imagenet_bs16_eva_336.py | 62 +++++ .../_base_/datasets/imagenet_bs16_eva_560.py | 62 +++++ configs/_base_/models/eva/eva-g.py | 30 +++ configs/_base_/models/eva/eva-l.py | 31 +++ configs/eva/README.md | 62 +++++ configs/eva/eva-g-p14_8xb16_in1k-336px.py | 9 + configs/eva/eva-g-p14_8xb16_in1k-560px.py | 9 + configs/eva/eva-g-p14_headless.py | 25 ++ configs/eva/eva-g-p16_headless.py | 25 ++ configs/eva/eva-l-p14_8xb16_in1k-196px.py | 9 + configs/eva/eva-l-p14_8xb16_in1k-336px.py | 9 + configs/eva/eva-l-p14_headless.py | 26 +++ configs/eva/metafile.yml | 214 ++++++++++++++++++ mmcls/apis/model.py | 5 +- mmcls/models/backbones/vision_transformer.py | 10 + mmcls/models/utils/embed.py | 4 +- model-index.yml | 1 + tests/test_models/test_backbones/test_beit.py | 1 + tools/model_converters/eva_to_mmcls.py | 75 ++++++ 23 files changed, 759 insertions(+), 22 deletions(-) create mode 100644 configs/_base_/datasets/imagenet_bs16_eva_196.py create mode 100644 configs/_base_/datasets/imagenet_bs16_eva_336.py create mode 100644 configs/_base_/datasets/imagenet_bs16_eva_560.py create mode 100644 configs/_base_/models/eva/eva-g.py create mode 100644 configs/_base_/models/eva/eva-l.py create mode 100644 configs/eva/README.md create mode 100644 configs/eva/eva-g-p14_8xb16_in1k-336px.py create mode 100644 configs/eva/eva-g-p14_8xb16_in1k-560px.py create mode 100644 configs/eva/eva-g-p14_headless.py create mode 100644 configs/eva/eva-g-p16_headless.py create mode 100644 configs/eva/eva-l-p14_8xb16_in1k-196px.py create mode 100644 configs/eva/eva-l-p14_8xb16_in1k-336px.py create mode 100644 configs/eva/eva-l-p14_headless.py create mode 100644 configs/eva/metafile.yml create mode 100644 tools/model_converters/eva_to_mmcls.py diff --git a/.dev_scripts/benchmark_regression/1-benchmark_valid.py b/.dev_scripts/benchmark_regression/1-benchmark_valid.py index f2335a4fdd5..c8c63c2daa1 100644 --- a/.dev_scripts/benchmark_regression/1-benchmark_valid.py +++ b/.dev_scripts/benchmark_regression/1-benchmark_valid.py @@ -12,11 +12,12 @@ from mmengine import Config, DictAction, MMLogger from mmengine.dataset import Compose, default_collate from mmengine.fileio import FileClient -from mmengine.runner import Runner +from mmengine.runner import Runner, load_checkpoint from modelindex.load_model_index import load from rich.console import Console from rich.table import Table +from mmcls.apis import init_model from mmcls.datasets import CIFAR10, CIFAR100, ImageNet from mmcls.utils import register_all_modules from mmcls.visualization import 
ClsVisualizer @@ -82,20 +83,27 @@ def inference(config_file, checkpoint, work_dir, args, exp_name): if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) - # build the data pipeline - test_dataset = cfg.test_dataloader.dataset - if test_dataset.pipeline[0]['type'] != 'LoadImageFromFile': - test_dataset.pipeline.insert(0, dict(type='LoadImageFromFile')) - if test_dataset.type in ['CIFAR10', 'CIFAR100']: - # The image shape of CIFAR is (32, 32, 3) - test_dataset.pipeline.insert(1, dict(type='Resize', scale=32)) - - data = Compose(test_dataset.pipeline)({'img_path': args.img}) - data = default_collate([data] * args.batch_size) - resolution = tuple(data['inputs'].shape[-2:]) - - runner: Runner = Runner.from_cfg(cfg) - model = runner.model + if 'test_dataloader' in cfg: + # build the data pipeline + test_dataset = cfg.test_dataloader.dataset + if test_dataset.pipeline[0]['type'] != 'LoadImageFromFile': + test_dataset.pipeline.insert(0, dict(type='LoadImageFromFile')) + if test_dataset.type in ['CIFAR10', 'CIFAR100']: + # The image shape of CIFAR is (32, 32, 3) + test_dataset.pipeline.insert(1, dict(type='Resize', scale=32)) + + data = Compose(test_dataset.pipeline)({'img_path': args.img}) + data = default_collate([data] * args.batch_size) + resolution = tuple(data['inputs'].shape[-2:]) + model = Runner.from_cfg(cfg).model + forward = model.val_step + else: + # For configs only for get model. + model = init_model(cfg) + load_checkpoint(model, checkpoint, map_location='cpu') + data = torch.empty(1, 3, 224, 224).to(model.data_preprocessor.device) + resolution = (224, 224) + forward = model.extract_feat # forward the model result = {'resolution': resolution} @@ -103,16 +111,16 @@ def inference(config_file, checkpoint, work_dir, args, exp_name): if args.inference_time: time_record = [] for _ in range(10): - model.val_step(data) # warmup before profiling + forward(data) # warmup before profiling torch.cuda.synchronize() start = time() - model.val_step(data) + forward(data) torch.cuda.synchronize() time_record.append((time() - start) / args.batch_size * 1000) result['time_mean'] = np.mean(time_record[1:-1]) result['time_std'] = np.std(time_record[1:-1]) else: - model.val_step(data) + forward(data) result['model'] = config_file.stem @@ -144,8 +152,8 @@ def show_summary(summary_data, args): if args.inference_time: table.add_column('Inference Time (std) (ms/im)') if args.flops: - table.add_column('Flops', justify='right', width=11) - table.add_column('Params', justify='right') + table.add_column('Flops', justify='right', width=13) + table.add_column('Params', justify='right', width=11) for model_name, summary in summary_data.items(): row = [model_name] diff --git a/README.md b/README.md index e75daf82afb..3bdb53c7de5 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea - [x] [DaViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/davit) - [x] [RepLKNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/replknet) - [x] [BEiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beit) / [BEiT v2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beitv2) +- [x] [EVA](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/eva) diff --git a/README_zh-CN.md b/README_zh-CN.md index 01c728c7bf4..0ebde4aefa0 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -154,6 +154,7 @@ mim install -e . 
- [x] [DaViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/davit) - [x] [RepLKNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/replknet) - [x] [BEiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beit) / [BEiT v2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beitv2) +- [x] [EVA](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/eva) diff --git a/configs/_base_/datasets/imagenet_bs16_eva_196.py b/configs/_base_/datasets/imagenet_bs16_eva_196.py new file mode 100644 index 00000000000..292603c0030 --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs16_eva_196.py @@ -0,0 +1,62 @@ +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=196, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='ResizeEdge', + scale=196, + edge='short', + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=196), + dict(type='PackClsInputs'), +] + +train_dataloader = dict( + batch_size=16, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/train.txt', + data_prefix='train', + pipeline=train_pipeline), + sampler=dict(type='DefaultSampler', shuffle=True), +) + +val_dataloader = dict( + batch_size=16, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/val.txt', + data_prefix='val', + pipeline=test_pipeline), + sampler=dict(type='DefaultSampler', shuffle=False), +) +val_evaluator = dict(type='Accuracy', topk=(1, 5)) + +# If you want standard test, please manually configure the test dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/imagenet_bs16_eva_336.py b/configs/_base_/datasets/imagenet_bs16_eva_336.py new file mode 100644 index 00000000000..094c7ddd22f --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs16_eva_336.py @@ -0,0 +1,62 @@ +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=336, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='ResizeEdge', + scale=336, + edge='short', + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=336), + dict(type='PackClsInputs'), +] + +train_dataloader = dict( + batch_size=16, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/train.txt', + data_prefix='train', + pipeline=train_pipeline), + sampler=dict(type='DefaultSampler', shuffle=True), +) + +val_dataloader = dict( + batch_size=16, + num_workers=5, + 
dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/val.txt', + data_prefix='val', + pipeline=test_pipeline), + sampler=dict(type='DefaultSampler', shuffle=False), +) +val_evaluator = dict(type='Accuracy', topk=(1, 5)) + +# If you want standard test, please manually configure the test dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/imagenet_bs16_eva_560.py b/configs/_base_/datasets/imagenet_bs16_eva_560.py new file mode 100644 index 00000000000..2df2ab45db5 --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs16_eva_560.py @@ -0,0 +1,62 @@ +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=560, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='ResizeEdge', + scale=560, + edge='short', + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=560), + dict(type='PackClsInputs'), +] + +train_dataloader = dict( + batch_size=16, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/train.txt', + data_prefix='train', + pipeline=train_pipeline), + sampler=dict(type='DefaultSampler', shuffle=True), +) + +val_dataloader = dict( + batch_size=16, + num_workers=5, + dataset=dict( + type=dataset_type, + data_root='data/imagenet', + ann_file='meta/val.txt', + data_prefix='val', + pipeline=test_pipeline), + sampler=dict(type='DefaultSampler', shuffle=False), +) +val_evaluator = dict(type='Accuracy', topk=(1, 5)) + +# If you want standard test, please manually configure the test dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator diff --git a/configs/_base_/models/eva/eva-g.py b/configs/_base_/models/eva/eva-g.py new file mode 100644 index 00000000000..629e32e20b9 --- /dev/null +++ b/configs/_base_/models/eva/eva-g.py @@ -0,0 +1,30 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='BEiT', + arch='eva-g', + img_size=224, + patch_size=14, + avg_token=True, + layer_scale_init_value=0.0, + output_cls_token=False, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + ), + neck=None, + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1408, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=.02), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.), + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0) + ])) diff --git a/configs/_base_/models/eva/eva-l.py b/configs/_base_/models/eva/eva-l.py new file mode 100644 index 00000000000..dcc94f2d429 --- /dev/null +++ b/configs/_base_/models/eva/eva-l.py @@ -0,0 +1,31 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='BEiT', + arch='l', + img_size=224, + patch_size=14, + avg_token=True, + layer_scale_init_value=0.0, + output_cls_token=False, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + 
layer_cfgs=dict(bias=True), + ), + neck=None, + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1024, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=.02), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.), + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0) + ])) diff --git a/configs/eva/README.md b/configs/eva/README.md new file mode 100644 index 00000000000..be9724fa906 --- /dev/null +++ b/configs/eva/README.md @@ -0,0 +1,62 @@ +# EVA + +> [EVA: Exploring the Limits of Masked Visual Representation Learning at Scale](https://arxiv.org/abs/2211.07636) + + + +## Abstract + +We launch EVA, a vision-centric foundation model to explore the limits of visual representation at scale using only publicly accessible data. EVA is a vanilla ViT pre-trained to reconstruct the masked out image-text aligned vision features conditioned on visible image patches. Via this pretext task, we can efficiently scale up EVA to one billion parameters, and sets new records on a broad range of representative vision downstream tasks, such as image recognition, video action recognition, object detection, instance segmentation and semantic segmentation without heavy supervised training. Moreover, we observe quantitative changes in scaling EVA result in qualitative changes in transfer learning performance that are not present in other models. For instance, EVA takes a great leap in the challenging large vocabulary instance segmentation task: our model achieves almost the same state-of-the-art performance on LVISv1.0 dataset with over a thousand categories and COCO dataset with only eighty categories. Beyond a pure vision encoder, EVA can also serve as a vision-centric, multi-modal pivot to connect images and text. We find initializing the vision tower of a giant CLIP from EVA can greatly stabilize the training and outperform the training from scratch counterpart with much fewer samples and less compute, providing a new direction for scaling up and accelerating the costly training of multi-modal foundation models. + +
+ +
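+
+## How to use it?
+
+The snippet below is a minimal usage sketch, not part of the official EVA instructions: it follows the pattern used by other model READMEs in this repository, and the model name is taken from `configs/eva/metafile.yml` in this patch (the demo image path is a placeholder).
+
+```python
+>>> import torch
+>>> from mmcls.apis import get_model, inference_model
+>>>
+>>> # Build a fine-tuned EVA-L model from the model zoo by its metafile name.
+>>> model = get_model('eva-l-p14_mim-in21k-pre_3rdparty_in1k-196px', pretrained=True)
+>>>
+>>> # Predict the class of a single image.
+>>> predict = inference_model(model, 'demo/demo.JPEG')
+>>> print(predict['pred_class'])
+>>>
+>>> # Or extract backbone features from a dummy input.
+>>> inputs = torch.rand(1, 3, 196, 196).to(model.data_preprocessor.device)
+>>> feats = model.extract_feat(inputs)
+>>> print(type(feats), feats[-1].shape)
+```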
+ +## Results and models + +### merged-30M + +The pre-trained models on merged-30M are used to fine-tune, and therefore don't have evaluation results. + +| Model | patch size | resolution | Download | +| :--------------------------------- | :--------: | :--------: | :----------------------------------------------------------------------------------------------------------: | +| EVA-G (`eva-g-p14_3rdparty_30m`)\* | 14 | 224x224 | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_3rdparty_30m_20221213-3b7aca97.pth) | +| EVA-G (`eva-g-p16_3rdparty_30m`)\* | 14 to 16 | 224x224 | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p16_3rdparty_30m_20221213-7bed23ee.pth) | + +*Models with * are converted from the [official repo](https://github.com/baaivision/EVA).* + +### ImageNet-21k + +The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don't have evaluation results. + +| Model | Pretrain | resolution | Download | +| :------------------------------------------- | :-------------------: | :--------: | :--------------------------------------------------------------------------------------------------------------------: | +| EVA-G (`eva-g-p14_30m-pre_3rdparty_in21k`)\* | merged-30M | 224x224 | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-pre_3rdparty_in21k_20221213-d72285b7.pth) | +| EVA-L (`eva-l-p14_3rdparty-mim_in21k`)\* | From scratch with MIM | 224x224 | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_3rdparty-mim_in21k_20221213-3a5da50b.pth) | +| EVA-L (`eva-l-p14_mim-pre_3rdparty_in21k`)\* | MIM | 224x224 | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-pre_3rdparty_in21k_20221213-8f194fa2.pth) | + +*Models with * are converted from the [official repo](https://github.com/baaivision/EVA).* + +### ImageNet-1k + +| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------------------------------------ | :-----------------------: | :--------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------: | :--------------------------------------------: | +| EVA-G (`eva-g-p14_30m-in21k-pre_3rdparty_in1k-336px`)\* | merged-30M & ImageNet-21k | 336x336 | 1013.01 | 620.64 | 89.61 | 98.93 | [config](./eva-g-p14_8xb16_in1k-336px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-in21k-pre_3rdparty_in1k-336px_20221213-210f9071.pth) | +| EVA-G (`eva-g-p14_30m-in21k-pre_3rdparty_in1k-560px`)\* | merged-30M & ImageNet-21k | 560x560 | 1014.45 | 1906.76 | 89.71 | 98.96 | [config](./eva-g-p14_8xb16_in1k-560px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-in21k-pre_3rdparty_in1k-560px_20221213-fa1c3652.pth) | +| EVA-L (`eva-l-p14_mim-pre_3rdparty_in1k-336px`)\* | MIM | 336x336 | 304.53 | 191.10 | 88.66 | 98.75 | [config](./eva-l-p14_8xb16_in1k-336px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-in21k-pre_3rdparty_in1k-336px_20221213-f25b7634.pth) | +| EVA-L (`eva-l-p14_mim-in21k-pre_3rdparty_in1k-336px`)\* | MIM & ImageNet-21k | 336x336 | 304.53 | 191.10 | 89.17 | 98.86 | [config](./eva-l-p14_8xb16_in1k-336px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-in21k-pre_3rdparty_in1k-336px_20221213-f25b7634.pth) | +| EVA-L (`eva-l-p14_mim-pre_3rdparty_in1k-196px`)\* | MIM | 196x196 | 304.14 | 61.57 | 87.94 | 98.50 | 
[config](./eva-l-p14_8xb16_in1k-196px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-pre_3rdparty_in1k-196px_20221214-2adf4d28.pth) | +| EVA-L (`eva-l-p14_mim-in21k-pre_3rdparty_in1k-196px`)\* | MIM & ImageNet-21k | 196x196 | 304.14 | 61.57 | 88.58 | 98.65 | [config](./eva-l-p14_8xb16_in1k-196px.py) | [model](https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-in21k-pre_3rdparty_in1k-196px_20221213-b730c7e7.pth) | + +*Models with * are converted from the [official repo](https://github.com/baaivision/EVA). The config files of these models are only for inference.* + +## Citation + +```bibtex +@article{EVA, + title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale}, + author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue}, + journal={arXiv preprint arXiv:2211.07636}, + year={2022} +} +``` diff --git a/configs/eva/eva-g-p14_8xb16_in1k-336px.py b/configs/eva/eva-g-p14_8xb16_in1k-336px.py new file mode 100644 index 00000000000..aa2bd7ee5be --- /dev/null +++ b/configs/eva/eva-g-p14_8xb16_in1k-336px.py @@ -0,0 +1,9 @@ +_base_ = [ + '../_base_/models/eva/eva-g.py', + '../_base_/datasets/imagenet_bs16_eva_336.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# model settings +model = dict(backbone=dict(img_size=336)) diff --git a/configs/eva/eva-g-p14_8xb16_in1k-560px.py b/configs/eva/eva-g-p14_8xb16_in1k-560px.py new file mode 100644 index 00000000000..ed20866b7f0 --- /dev/null +++ b/configs/eva/eva-g-p14_8xb16_in1k-560px.py @@ -0,0 +1,9 @@ +_base_ = [ + '../_base_/models/eva/eva-g.py', + '../_base_/datasets/imagenet_bs16_eva_560.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# model settings +model = dict(backbone=dict(img_size=560)) diff --git a/configs/eva/eva-g-p14_headless.py b/configs/eva/eva-g-p14_headless.py new file mode 100644 index 00000000000..ba3bb55289d --- /dev/null +++ b/configs/eva/eva-g-p14_headless.py @@ -0,0 +1,25 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='BEiT', + arch='eva-g', + img_size=224, + patch_size=14, + avg_token=True, + layer_scale_init_value=0.0, + output_cls_token=False, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + ), + neck=None, + head=None, +) + +data_preprocessor = dict( + # RGB format normalization parameters + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + # convert image from BGR to RGB + to_rgb=True, +) diff --git a/configs/eva/eva-g-p16_headless.py b/configs/eva/eva-g-p16_headless.py new file mode 100644 index 00000000000..10ac1a8c26e --- /dev/null +++ b/configs/eva/eva-g-p16_headless.py @@ -0,0 +1,25 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='BEiT', + arch='eva-g', + img_size=224, + patch_size=16, + avg_token=True, + layer_scale_init_value=0.0, + output_cls_token=False, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + ), + neck=None, + head=None, +) + +data_preprocessor = dict( + # RGB format normalization parameters + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + # convert image from BGR to RGB + to_rgb=True, +) diff --git a/configs/eva/eva-l-p14_8xb16_in1k-196px.py b/configs/eva/eva-l-p14_8xb16_in1k-196px.py 
new file mode 100644 index 00000000000..3503ca5d780 --- /dev/null +++ b/configs/eva/eva-l-p14_8xb16_in1k-196px.py @@ -0,0 +1,9 @@ +_base_ = [ + '../_base_/models/eva/eva-l.py', + '../_base_/datasets/imagenet_bs16_eva_196.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# model settings +model = dict(backbone=dict(img_size=196)) diff --git a/configs/eva/eva-l-p14_8xb16_in1k-336px.py b/configs/eva/eva-l-p14_8xb16_in1k-336px.py new file mode 100644 index 00000000000..7094df8ba3d --- /dev/null +++ b/configs/eva/eva-l-p14_8xb16_in1k-336px.py @@ -0,0 +1,9 @@ +_base_ = [ + '../_base_/models/eva/eva-l.py', + '../_base_/datasets/imagenet_bs16_eva_336.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# model settings +model = dict(backbone=dict(img_size=336)) diff --git a/configs/eva/eva-l-p14_headless.py b/configs/eva/eva-l-p14_headless.py new file mode 100644 index 00000000000..d32346550f2 --- /dev/null +++ b/configs/eva/eva-l-p14_headless.py @@ -0,0 +1,26 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='BEiT', + arch='l', + img_size=224, + patch_size=14, + avg_token=True, + layer_scale_init_value=0.0, + output_cls_token=False, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + layer_cfgs=dict(bias=True), + ), + neck=None, + head=None, +) + +data_preprocessor = dict( + # RGB format normalization parameters + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + # convert image from BGR to RGB + to_rgb=True, +) diff --git a/configs/eva/metafile.yml b/configs/eva/metafile.yml new file mode 100644 index 00000000000..7dbb4b16cb1 --- /dev/null +++ b/configs/eva/metafile.yml @@ -0,0 +1,214 @@ +Collections: + - Name: EVA + Metadata: + Architecture: + - Attention Dropout + - Convolution + - Dense Connections + - Dropout + - GELU + - Layer Normalization + - Multi-Head Attention + - Scaled Dot-Product Attention + - Tanh Activation + Paper: + URL: https://arxiv.org/abs/2211.07636 + Title: 'EVA: Exploring the Limits of Masked Visual Representation Learning at Scale' + README: configs/eva/README.md + Code: + URL: + Version: + +Models: + - Name: eva-g-p14_3rdparty_30m + In Collection: EVA + Metadata: + FLOPs: 267174833024 + Parameters: 1011596672 + Training Data: + - merged-30M + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_3rdparty_30m_20221213-3b7aca97.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_psz14.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-g-p14_headless.py + + - Name: eva-g-p16_3rdparty_30m + In Collection: EVA + Metadata: + FLOPs: 203517463424 + Parameters: 1011315072 + Training Data: + - merged-30M + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p16_3rdparty_30m_20221213-7bed23ee.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_psz14to16.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-g-p16_headless.py + + - Name: eva-g-p14_30m-pre_3rdparty_in21k + In Collection: EVA + Metadata: + FLOPs: 267174833024 + Parameters: 1011596672 + Training Data: + - merged-30M + - ImageNet-21k + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-pre_3rdparty_in21k_20221213-d72285b7.pth + Converted From: + Weights: 
https://huggingface.co/BAAI/EVA/blob/main/eva_21k_224px_psz14.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-g-p14_headless.py + + - Name: eva-g-p14_30m-in21k-pre_3rdparty_in1k-336px + In Collection: EVA + Metadata: + FLOPs: 620642757504 + Parameters: 1013005672 + Training Data: + - merged-30M + - ImageNet-21k + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 89.61 + Top 5 Accuracy: 98.93 + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-in21k-pre_3rdparty_in1k-336px_20221213-210f9071.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_21k_1k_336px_psz14_ema_89p6.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-g-p14_8xb16_in1k-336px.py + + - Name: eva-g-p14_30m-in21k-pre_3rdparty_in1k-560px + In Collection: EVA + Metadata: + FLOPs: 1906761591680 + Parameters: 1014447464 + Training Data: + - merged-30M + - ImageNet-21k + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 89.71 + Top 5 Accuracy: 98.96 + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-g-p14_30m-in21k-pre_3rdparty_in1k-560px_20221213-fa1c3652.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_21k_1k_560px_psz14_ema_89p7.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-g-p14_8xb16_in1k-560px.py + + - Name: eva-l-p14_3rdparty-mim_in21k + In Collection: EVA + Metadata: + FLOPs: 81075147776 + Parameters: 303178752 + Training Data: + - ImageNet-21k + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_3rdparty-mim_in21k_20221213-3a5da50b.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_l_psz14.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-l-p14_headless.py + + - Name: eva-l-p14_mim-pre_3rdparty_in21k + In Collection: EVA + Metadata: + FLOPs: 81075147776 + Parameters: 303178752 + Training Data: + - ImageNet-21k + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-pre_3rdparty_in21k_20221213-8f194fa2.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_l_psz14_21k_ft.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-l-p14_headless.py + + - Name: eva-l-p14_mim-pre_3rdparty_in1k-336px + In Collection: EVA + Metadata: + FLOPs: 191100916736 + Parameters: 304531432 + Training Data: + - ImageNet-21k + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 88.66 + Top 5 Accuracy: 98.75 + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-pre_3rdparty_in1k-336px_20221214-07785cfd.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_l_psz14_336px_1k_ft_88p65.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-l-p14_8xb16_in1k-336px.py + + - Name: eva-l-p14_mim-in21k-pre_3rdparty_in1k-336px + In Collection: EVA + Metadata: + FLOPs: 191100916736 + Parameters: 304531432 + Training Data: + - ImageNet-21k + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 89.17 + Top 5 Accuracy: 98.86 + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-in21k-pre_3rdparty_in1k-336px_20221213-f25b7634.pth + Converted From: + Weights: 
https://huggingface.co/BAAI/EVA/blob/main/eva_l_psz14_336px_21k_to_1k_ft_89p2.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-l-p14_8xb16_in1k-336px.py + + - Name: eva-l-p14_mim-pre_3rdparty_in1k-196px + In Collection: EVA + Metadata: + FLOPs: 61565981696 + Parameters: 304142312 + Training Data: + - ImageNet-21k + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 87.94 + Top 5 Accuracy: 98.50 + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-pre_3rdparty_in1k-196px_20221214-2adf4d28.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_l_psz14_196px_1k_ft_88p0.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-l-p14_8xb16_in1k-196px.py + + - Name: eva-l-p14_mim-in21k-pre_3rdparty_in1k-196px + In Collection: EVA + Metadata: + FLOPs: 61565981696 + Parameters: 304142312 + Training Data: + - ImageNet-21k + - ImageNet-1k + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 88.58 + Top 5 Accuracy: 98.65 + Weights: https://download.openmmlab.com/mmclassification/v0/eva/eva-l-p14_mim-in21k-pre_3rdparty_in1k-196px_20221213-b730c7e7.pth + Converted From: + Weights: https://huggingface.co/BAAI/EVA/blob/main/eva_l_psz14_196px_21k_to_1k_ft_88p6.pt + Code: https://github.com/baaivision/EVA + Config: configs/eva/eva-l-p14_8xb16_in1k-196px.py diff --git a/mmcls/apis/model.py b/mmcls/apis/model.py index ccf6438d29d..11407cc0cdd 100644 --- a/mmcls/apis/model.py +++ b/mmcls/apis/model.py @@ -114,7 +114,10 @@ def init_model(config, checkpoint=None, device=None, **kwargs): # Mapping the weights to GPU may cause unexpected video memory leak # which refers to https://github.com/open-mmlab/mmdetection/pull/6405 checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') - if 'dataset_meta' in checkpoint.get('meta', {}): + if not model.with_head: + # Don't set CLASSES if the model is headless. 
+ pass + elif 'dataset_meta' in checkpoint.get('meta', {}): # mmcls 1.x model.CLASSES = checkpoint['meta']['dataset_meta']['classes'] elif 'CLASSES' in checkpoint.get('meta', {}): diff --git a/mmcls/models/backbones/vision_transformer.py b/mmcls/models/backbones/vision_transformer.py index 5ec53a13e3a..a3771f2695f 100644 --- a/mmcls/models/backbones/vision_transformer.py +++ b/mmcls/models/backbones/vision_transformer.py @@ -183,6 +183,16 @@ class VisionTransformer(BaseBackbone): 'num_heads': 16, 'feedforward_channels': 5120 }), + **dict.fromkeys( + ['eva-g', 'eva-giant'], + { + # The implementation in EVA + # + 'embed_dims': 1408, + 'num_layers': 40, + 'num_heads': 16, + 'feedforward_channels': 6144 + }), **dict.fromkeys( ['deit-t', 'deit-tiny'], { 'embed_dims': 192, diff --git a/mmcls/models/utils/embed.py b/mmcls/models/utils/embed.py index 1a1a8369912..bd2b17dc24b 100644 --- a/mmcls/models/utils/embed.py +++ b/mmcls/models/utils/embed.py @@ -50,9 +50,11 @@ def resize_pos_embed(pos_embed, src_weight = pos_embed[:, num_extra_tokens:] src_weight = src_weight.reshape(1, src_h, src_w, C).permute(0, 3, 1, 2) + # The cubic interpolate algorithm only accepts float32 dst_weight = F.interpolate( - src_weight, size=dst_shape, align_corners=False, mode=mode) + src_weight.float(), size=dst_shape, align_corners=False, mode=mode) dst_weight = torch.flatten(dst_weight, 2).transpose(1, 2) + dst_weight = dst_weight.to(src_weight.dtype) return torch.cat((extra_tokens, dst_weight), dim=1) diff --git a/model-index.yml b/model-index.yml index 60ef92c60c3..adce8e2ae91 100644 --- a/model-index.yml +++ b/model-index.yml @@ -42,4 +42,5 @@ Import: - configs/csra/metafile.yml - configs/beit/metafile.yml - configs/beitv2/metafile.yml + - configs/eva/metafile.yml - configs/revvit/metafile.yml diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py index 02968bdec5f..cb9512b6b2a 100644 --- a/tests/test_models/test_backbones/test_beit.py +++ b/tests/test_models/test_backbones/test_beit.py @@ -75,6 +75,7 @@ def test_structure(self): self.assertEqual(layer.attn.embed_dims, 768) self.assertEqual(layer.attn.num_heads, 12) self.assertEqual(layer.ffn.feedforward_channels, 3072) + self.assertFalse(layer.ffn.add_identity) self.assertAlmostEqual(layer.ffn.dropout_layer.drop_prob, dpr) dpr += dpr_inc diff --git a/tools/model_converters/eva_to_mmcls.py b/tools/model_converters/eva_to_mmcls.py new file mode 100644 index 00000000000..14ac24ec4f1 --- /dev/null +++ b/tools/model_converters/eva_to_mmcls.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from collections import OrderedDict + +import mmengine +import torch +from mmengine.runner import CheckpointLoader + + +def convert_eva(ckpt): + + new_ckpt = OrderedDict() + + for k, v in list(ckpt.items()): + if 'decoder' in k or 'mask_token' in k: + continue + new_v = v + if k.startswith('head'): + new_k = k.replace('head.', 'head.fc.') + new_ckpt[new_k] = new_v + continue + elif k.startswith('patch_embed'): + if 'proj.' 
in k: + new_k = k.replace('proj.', 'projection.') + else: + new_k = k + elif k.startswith('blocks'): + new_k = k.replace('blocks.', 'layers.') + if 'norm1' in k: + new_k = new_k.replace('norm1', 'ln1') + elif 'norm2' in k: + new_k = new_k.replace('norm2', 'ln2') + elif 'mlp.fc1' in k: + new_k = new_k.replace('mlp.fc1', 'ffn.layers.0.0') + elif 'mlp.fc2' in k: + new_k = new_k.replace('mlp.fc2', 'ffn.layers.1') + elif 'fc_norm' in k: + new_k = k.replace('fc_norm', 'ln2') + elif k.startswith('norm'): + # for mim pretrain + new_k = k.replace('norm', 'ln2') + else: + new_k = k + + if not new_k.startswith('head'): + new_k = 'backbone.' + new_k + new_ckpt[new_k] = new_v + return new_ckpt + + +def main(): + parser = argparse.ArgumentParser( + description='Convert keys in pretrained van models to mmcls style.') + parser.add_argument('src', help='src model path or url') + # The dst path must be a full path of the new checkpoint. + parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + + if 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + + weight = convert_eva(state_dict) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(weight, args.dst) + + print('Done!!') + + +if __name__ == '__main__': + main() From 46af7d3ed2538dea3133c06250dd3683fd792e38 Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Wed, 14 Dec 2022 13:47:32 +0800 Subject: [PATCH 05/21] [CI] Update CI to test PyTorch 1.13.0. (#1260) --- .circleci/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/test.yml b/.circleci/test.yml index 12ea1636d72..a0fcb0fe05d 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -207,9 +207,9 @@ workflows: - lint - build_cpu_with_3rdparty: name: maximum_version_cpu - torch: 1.12.1 - torchvision: 0.13.1 - python: 3.9.0 + torch: 1.13.0 + torchvision: 0.14.0 + python: 3.10.0 requires: - minimum_version_cpu - hold: From 6ea59bd8461a137db0ce4419b6eb049eded74fad Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Mon, 19 Dec 2022 13:01:11 +0800 Subject: [PATCH 06/21] [Fix] Fix the requirements and lazy register mmcls models. (#1275) --- mmcls/apis/inference.py | 15 +++++++++------ mmcls/apis/model.py | 29 ++++++++++++++++++----------- requirements/runtime.txt | 1 + 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/mmcls/apis/inference.py b/mmcls/apis/inference.py index 4a2160424dc..e1e8a0be390 100644 --- a/mmcls/apis/inference.py +++ b/mmcls/apis/inference.py @@ -1,16 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Union +from typing import TYPE_CHECKING, Union import numpy as np import torch -from mmengine.dataset import Compose, default_collate -from mmengine.model import BaseModel -from mmengine.registry import DefaultScope -import mmcls.datasets # noqa: F401 +if TYPE_CHECKING: + from mmengine.model import BaseModel -def inference_model(model: BaseModel, img: Union[str, np.ndarray]): +def inference_model(model: 'BaseModel', img: Union[str, np.ndarray]): """Inference image(s) with the classifier. Args: @@ -21,6 +19,11 @@ def inference_model(model: BaseModel, img: Union[str, np.ndarray]): result (dict): The classification results that contains `class_name`, `pred_label` and `pred_score`. 
""" + from mmengine.dataset import Compose, default_collate + from mmengine.registry import DefaultScope + + import mmcls.datasets # noqa: F401 + cfg = model.cfg # build the data pipeline test_pipeline_cfg = cfg.test_dataloader.dataset.pipeline diff --git a/mmcls/apis/model.py b/mmcls/apis/model.py index 11407cc0cdd..a4882102bc6 100644 --- a/mmcls/apis/model.py +++ b/mmcls/apis/model.py @@ -8,18 +8,14 @@ from typing import List, Union from mmengine.config import Config -from mmengine.runner import load_checkpoint -from mmengine.utils import get_installed_path from modelindex.load_model_index import load from modelindex.models.Model import Model -import mmcls.models # noqa: F401 -from mmcls.registry import MODELS - class ModelHub: """A hub to host the meta information of all pre-defined models.""" _models_dict = {} + __mmcls_registered = False @classmethod def register_model_index(cls, @@ -56,6 +52,7 @@ def get(cls, model_name): Returns: modelindex.models.Model: The metainfo of the specified model. """ + cls._register_mmcls_models() # lazy load config metainfo = copy.deepcopy(cls._models_dict.get(model_name.lower())) if metainfo is None: @@ -77,12 +74,16 @@ def _expand_config_path(metainfo: Model, return config_path - -# register models in mmcls -mmcls_root = Path(get_installed_path('mmcls')) -model_index_path = mmcls_root / '.mim' / 'model-index.yml' -ModelHub.register_model_index( - model_index_path, config_prefix=mmcls_root / '.mim') + @classmethod + def _register_mmcls_models(cls): + # register models in mmcls + if not cls.__mmcls_registered: + from mmengine.utils import get_installed_path + mmcls_root = Path(get_installed_path('mmcls')) + model_index_path = mmcls_root / '.mim' / 'model-index.yml' + ModelHub.register_model_index( + model_index_path, config_prefix=mmcls_root / '.mim') + cls.__mmcls_registered = True def init_model(config, checkpoint=None, device=None, **kwargs): @@ -109,10 +110,15 @@ def init_model(config, checkpoint=None, device=None, **kwargs): config.merge_from_dict({'model': kwargs}) config.model.setdefault('data_preprocessor', config.get('data_preprocessor', None)) + + import mmcls.models # noqa: F401 + from mmcls.registry import MODELS + model = MODELS.build(config.model) if checkpoint is not None: # Mapping the weights to GPU may cause unexpected video memory leak # which refers to https://github.com/open-mmlab/mmdetection/pull/6405 + from mmengine.runner import load_checkpoint checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') if not model.with_head: # Don't set CLASSES if the model is headless. @@ -216,6 +222,7 @@ def list_models(pattern=None) -> List[str]: 'resnet50_8xb256-rsb-a2-300e_in1k', 'resnet50_8xb256-rsb-a3-100e_in1k'] """ + ModelHub._register_mmcls_models() if pattern is None: return sorted(list(ModelHub._models_dict.keys())) # Always match keys with any postfix. diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 9f814b0eab9..e0acc881b7e 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,4 +1,5 @@ matplotlib +modelindex numpy packaging rich From 0e4163668ffdfad95a82331093b833305c3b19af Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Mon, 19 Dec 2022 13:53:13 +0800 Subject: [PATCH 07/21] [Feature] Add some scripts for development. (#1257) * [Feature] Add some scripts for development. * Add `generate_readme.py`. 
* Update according to comments --- .dev_scripts/ckpt_tree.py | 186 ++++++++++++++++++++++++++++++++ .dev_scripts/compare_init.py | 121 +++++++++++++++++++++ .dev_scripts/generate_readme.py | 161 +++++++++++++++++++++++++++ 3 files changed, 468 insertions(+) create mode 100644 .dev_scripts/ckpt_tree.py create mode 100644 .dev_scripts/compare_init.py create mode 100644 .dev_scripts/generate_readme.py diff --git a/.dev_scripts/ckpt_tree.py b/.dev_scripts/ckpt_tree.py new file mode 100644 index 00000000000..787020e1be0 --- /dev/null +++ b/.dev_scripts/ckpt_tree.py @@ -0,0 +1,186 @@ +import argparse +import math +from pathlib import Path + +import torch +from rich.console import Console + +console = Console() + +prog_description = """\ +Draw the state dict tree. +""" + + +def parse_args(): + parser = argparse.ArgumentParser(description=prog_description) + parser.add_argument( + 'path', + type=Path, + help='The path of the checkpoint or model config to draw.') + parser.add_argument('--depth', type=int, help='The max depth to draw.') + parser.add_argument( + '--full-name', + action='store_true', + help='Whether to print the full name of the key.') + parser.add_argument( + '--shape', + action='store_true', + help='Whether to print the shape of the parameter.') + parser.add_argument( + '--state-key', + type=str, + help='The key of the state dict in the checkpoint.') + parser.add_argument( + '--number', + action='store_true', + help='Mark all parameters and their index number.') + parser.add_argument( + '--node', + type=str, + help='Show the sub-tree of a node, like "backbone.layers".') + args = parser.parse_args() + return args + + +def ckpt_to_state_dict(checkpoint, key=None): + if key is not None: + state_dict = checkpoint[key] + elif 'state_dict' in checkpoint: + # try mmcls style + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + elif isinstance(next(iter(checkpoint.values())), torch.Tensor): + # try native style + state_dict = checkpoint + else: + raise KeyError('Please specify the key of state ' + f'dict from {list(checkpoint.keys())}.') + return state_dict + + +class StateDictTree: + + def __init__(self, key='', value=None): + self.children = {} + self.key: str = key + self.value = value + + def add_parameter(self, key, value): + keys = key.split('.', 1) + if len(keys) == 1: + self.children[key] = StateDictTree(key, value) + elif keys[0] in self.children: + self.children[keys[0]].add_parameter(keys[1], value) + else: + node = StateDictTree(keys[0]) + node.add_parameter(keys[1], value) + self.children[keys[0]] = node + + def __getitem__(self, key: str): + return self.children[key] + + def __repr__(self) -> str: + with console.capture() as capture: + for line in self.iter_tree(): + console.print(line) + return capture.get() + + def __len__(self): + return len(self.children) + + def draw_tree(self, + max_depth=None, + full_name=False, + with_shape=False, + with_value=False): + for line in self.iter_tree( + max_depth=max_depth, + full_name=full_name, + with_shape=with_shape, + with_value=with_value, + ): + console.print(line, highlight=False) + + def iter_tree( + self, + lead='', + prefix='', + max_depth=None, + full_name=False, + with_shape=False, + with_value=False, + ): + if self.value is None: + key_str = f'[blue]{self.key}[/]' + elif with_shape: + key_str = f'[green]{self.key}[/] {tuple(self.value.shape)}' + elif with_value: + key_str = f'[green]{self.key}[/] {self.value}' + else: + key_str = f'[green]{self.key}[/]' + + yield lead + 
prefix + key_str + + lead = lead.replace('├─', '│ ') + lead = lead.replace('└─', ' ') + if self.key and full_name: + prefix = f'{prefix}{self.key}.' + + if max_depth == 0: + return + elif max_depth is not None: + max_depth -= 1 + + for i, child in enumerate(self.children.values()): + level_lead = '├─' if i < len(self.children) - 1 else '└─' + yield from child.iter_tree( + lead=f'{lead}{level_lead} ', + prefix=prefix, + max_depth=max_depth, + full_name=full_name, + with_shape=with_shape, + with_value=with_value) + + +def main(): + args = parse_args() + if args.path.suffix in ['.json', '.py', '.yml']: + from mmengine.runner import get_state_dict + + from mmcls.apis import init_model + model = init_model(args.path, device='cpu') + state_dict = get_state_dict(model) + else: + ckpt = torch.load(args.path, map_location='cpu') + state_dict = ckpt_to_state_dict(ckpt, args.state_key) + + root = StateDictTree() + for k, v in state_dict.items(): + root.add_parameter(k, v) + + para_index = 0 + mark_width = math.floor(math.log(len(state_dict), 10) + 1) + if args.node is not None: + for key in args.node.split('.'): + root = root[key] + + for line in root.iter_tree( + max_depth=args.depth, + full_name=args.full_name, + with_shape=args.shape, + ): + if not args.number: + mark = '' + # A hack method to determine whether a line is parameter. + elif '[green]' in line: + mark = f'[red]({str(para_index).ljust(mark_width)})[/]' + para_index += 1 + else: + mark = ' ' * (mark_width + 2) + console.print(mark + line, highlight=False) + + +if __name__ == '__main__': + main() diff --git a/.dev_scripts/compare_init.py b/.dev_scripts/compare_init.py new file mode 100644 index 00000000000..71030f67d46 --- /dev/null +++ b/.dev_scripts/compare_init.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python +import argparse +from pathlib import Path + +import matplotlib.pyplot as plt +import torch +from ckpt_tree import StateDictTree, ckpt_to_state_dict +from rich.progress import track +from scipy import stats + +prog_description = """\ +Compare the initialization distribution between state dicts by Kolmogorov-Smirnov test. +""" # noqa: E501 + + +def parse_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=prog_description) + parser.add_argument( + 'model_a', + type=Path, + help='The path of the first checkpoint or model config.') + parser.add_argument( + 'model_b', + type=Path, + help='The path of the second checkpoint or model config.') + parser.add_argument( + '--show', + action='store_true', + help='Whether to draw the KDE of variables') + parser.add_argument( + '-p', + default=0.01, + type=float, + help='The threshold of p-value. 
' + 'Higher threshold means more strict test.') + args = parser.parse_args() + return args + + +def compare_distribution(state_dict_a, state_dict_b, p_thres): + assert len(state_dict_a) == len(state_dict_b) + for k, v1 in state_dict_a.items(): + assert k in state_dict_b + v2 = state_dict_b[k] + v1 = v1.cpu().flatten() + v2 = v2.cpu().flatten() + pvalue = stats.kstest(v1, v2).pvalue + if pvalue < p_thres: + yield k, pvalue, v1, v2 + + +def state_dict_from_cfg_or_ckpt(path, state_key=None): + if path.suffix in ['.json', '.py', '.yml']: + from mmengine.runner import get_state_dict + + from mmcls.apis import init_model + model = init_model(path, device='cpu') + model.init_weights() + return get_state_dict(model) + else: + ckpt = torch.load(path, map_location='cpu') + return ckpt_to_state_dict(ckpt, state_key) + + +def main(): + args = parse_args() + + state_dict_a = state_dict_from_cfg_or_ckpt(args.model_a) + state_dict_b = state_dict_from_cfg_or_ckpt(args.model_b) + compare_keys = state_dict_a.keys() & state_dict_b.keys() + if len(compare_keys) == 0: + raise ValueError("The state dicts don't match, please convert " + 'to the same keys before comparison.') + + root = StateDictTree() + for key in track(compare_keys): + if state_dict_a[key].shape != state_dict_b[key].shape: + raise ValueError(f'The shapes of "{key}" are different. ' + 'Please check models in the same architecture.') + + # Sample at most 30000 items to prevent long-time calcuation. + perm_ids = torch.randperm(state_dict_a[key].numel())[:30000] + value_a = state_dict_a[key].flatten()[perm_ids] + value_b = state_dict_b[key].flatten()[perm_ids] + pvalue = stats.kstest(value_a, value_b).pvalue + if pvalue < args.p: + root.add_parameter(key, round(pvalue, 4)) + if args.show: + try: + import seaborn as sns + except ImportError: + raise ImportError('Please install `seaborn` by ' + '`pip install seaborn` to show KDE.') + sample_a = str([round(v.item(), 2) for v in value_a[:10]]) + sample_b = str([round(v.item(), 2) for v in value_b[:10]]) + if value_a.std() > 0: + sns.kdeplot(value_a, fill=True) + else: + sns.scatterplot(x=[value_a[0].item()], y=[1]) + if value_b.std() > 0: + sns.kdeplot(value_b, fill=True) + else: + sns.scatterplot(x=[value_b[0].item()], y=[1]) + plt.legend([ + f'{args.model_a.stem}: {sample_a}', + f'{args.model_b.stem}: {sample_b}' + ]) + plt.title(key) + plt.show() + if len(root) > 0: + root.draw_tree(with_value=True) + print("Above parameters didn't pass the test, " + 'and the values are their similarity score.') + else: + print('The distributions of all weights are the same.') + + +if __name__ == '__main__': + main() diff --git a/.dev_scripts/generate_readme.py b/.dev_scripts/generate_readme.py new file mode 100644 index 00000000000..bb7482b94f6 --- /dev/null +++ b/.dev_scripts/generate_readme.py @@ -0,0 +1,161 @@ +# flake8: noqa +import argparse +import warnings +from collections import defaultdict +from pathlib import Path + +from modelindex.load_model_index import load +from modelindex.models.ModelIndex import ModelIndex + +prog_description = """\ +Use metafile to generate a README.md. + +Notice that the tool may fail in some corner cases, and you still need to check and fill some contents manually in the generated README. 
+""" + + +def parse_args(): + parser = argparse.ArgumentParser(description=prog_description) + parser.add_argument('metafile', type=Path, help='The path of metafile') + parser.add_argument( + '--table', action='store_true', help='Only generate summary tables') + args = parser.parse_args() + return args + + +def add_title(metafile: ModelIndex, readme: list): + paper = metafile.collections[0].paper + title = paper['Title'] + url = paper['URL'] + abbr = metafile.collections[0].name + papertype = metafile.collections[0].data.get('type', 'Algorithm') + + readme.append(f'# {abbr}\n') + readme.append(f'> [{title}]({url})') + readme.append(f'') + readme.append('') + + +def add_abstract(metafile, readme): + paper = metafile.collections[0].paper + url = paper['URL'] + if 'arxiv' in url: + try: + import arxiv + search = arxiv.Search(id_list=[url.split('/')[-1]]) + info = next(search.results()) + abstract = info.summary + except ImportError: + warnings.warn('Install arxiv parser by `pip install arxiv` ' + 'to automatically generate abstract.') + abstract = None + else: + abstract = None + + readme.append('## Abstract\n') + if abstract is not None: + readme.append(abstract.replace('\n', ' ')) + + readme.append('') + readme.append('
\n' + '\n' + '
') + readme.append('') + + +def add_models(metafile, readme): + models = metafile.models + if len(models) == 0: + return + + readme.append('## Results and models') + readme.append('') + + datasets = defaultdict(list) + for model in models: + if model.results is None: + # No results on pretrained model. + datasets['Pre-trained Models'].append(model) + else: + datasets[model.results[0].dataset].append(model) + + for dataset, models in datasets.items(): + if dataset == 'Pre-trained Models': + readme.append(f'### {dataset}\n') + readme.append( + 'The pre-trained models are only used to fine-tune, ' + "and therefore cannot be trained and don't have evaluation results.\n" + ) + readme.append( + '| Model | Pretrain | Params(M) | Flops(G) | Config | Download |\n' + '|:---------------------:|:---------:|:---------:|:--------:|:------:|:--------:|' + ) + converted_from = None + for model in models: + name = model.name.center(21) + params = model.metadata.parameters / 1e6 + flops = model.metadata.flops / 1e9 + converted_from = converted_from or model.data.get( + 'Converted From', None) + config = './' + Path(model.config).name + weights = model.weights + star = '\*' if '3rdparty' in weights else '' + readme.append( + f'| {name}{star} | {params:.2f} | {flops:.2f} | [config]({config}) | [model]({weights}) |' + ), + if converted_from is not None: + readme.append('') + readme.append( + f"*Models with \* are converted from the [official repo]({converted_from['Code']}).*\n" + ) + else: + readme.append(f'### {dataset}\n') + readme.append( + '| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |\n' + '|:---------------------:|:----------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:|' + ) + converted_from = None + for model in models: + name = model.name.center(21) + params = model.metadata.parameters / 1e6 + flops = model.metadata.flops / 1e9 + metrics = model.results[0].metrics + top1 = metrics.get('Top 1 Accuracy') + top5 = metrics.get('Top 5 Accuracy', 0) + converted_from = converted_from or model.data.get( + 'Converted From', None) + config = './' + Path(model.config).name + weights = model.weights + star = '\*' if '3rdparty' in weights else '' + if 'in21k-pre' in weights: + pretrain = 'ImageNet 21k' + else: + pretrain = 'From scratch' + readme.append( + f'| {name}{star} | {pretrain} | {params:.2f} | {flops:.2f} | {top1:.2f} | {top5:.2f} | [config]({config}) | [model]({weights}) |' + ), + if converted_from is not None: + readme.append('') + readme.append( + f"*Models with \* are converted from the [official repo]({converted_from['Code']}). " + 'The config files of these models are only for inference. ' + "We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*\n" + ) + + +def main(): + args = parse_args() + metafile = load(str(args.metafile)) + readme_lines = [] + if not args.table: + add_title(metafile, readme_lines) + add_abstract(metafile, readme_lines) + add_models(metafile, readme_lines) + if not args.table: + readme_lines.append('## Citation\n') + readme_lines.append('```bibtex\n\n```\n') + print('\n'.join(readme_lines)) + + +if __name__ == '__main__': + main() From b63515111b53029dedc25f7a025d88be64dbf1f6 Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Mon, 19 Dec 2022 13:54:24 +0800 Subject: [PATCH 08/21] [Reproduce] Update ConvNeXt config files. (#1256) * Update ConvNeXt training configs. * Update ConvNeXt network. * Update metafile and README. 
* Update README --- .../_base_/models/convnext/convnext-base.py | 10 ++- .../_base_/models/convnext/convnext-large.py | 10 ++- .../_base_/models/convnext/convnext-small.py | 10 ++- .../_base_/models/convnext/convnext-tiny.py | 10 ++- .../_base_/models/convnext/convnext-xlarge.py | 10 ++- .../schedules/imagenet_bs1024_adamw_swin.py | 1 + configs/convnext/README.md | 56 ++++++------ .../convnext/convnext-base_32xb128_in1k.py | 4 +- .../convnext/convnext-base_32xb128_in21k.py | 24 +++++ .../convnext/convnext-large_64xb64_in1k.py | 4 +- .../convnext/convnext-large_64xb64_in21k.py | 24 +++++ .../convnext/convnext-small_32xb128_in1k.py | 4 +- .../convnext/convnext-tiny_32xb128_in1k.py | 4 +- .../convnext/convnext-xlarge_64xb64_in1k.py | 4 +- .../convnext/convnext-xlarge_64xb64_in21k.py | 24 +++++ configs/convnext/metafile.yml | 87 +++++++++++-------- docs/en/migration.md | 2 +- docs/en/user_guides/config.md | 8 +- mmcls/models/backbones/convnext.py | 24 +++-- 19 files changed, 225 insertions(+), 95 deletions(-) create mode 100644 configs/convnext/convnext-base_32xb128_in21k.py create mode 100644 configs/convnext/convnext-large_64xb64_in21k.py create mode 100644 configs/convnext/convnext-xlarge_64xb64_in21k.py diff --git a/configs/_base_/models/convnext/convnext-base.py b/configs/_base_/models/convnext/convnext-base.py index 7fc5ce71a74..86d611a640f 100644 --- a/configs/_base_/models/convnext/convnext-base.py +++ b/configs/_base_/models/convnext/convnext-base.py @@ -19,5 +19,11 @@ type='LinearClsHead', num_classes=1000, in_channels=1024, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - )) + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0), + ]), +) diff --git a/configs/_base_/models/convnext/convnext-large.py b/configs/_base_/models/convnext/convnext-large.py index 4d9e37c0df9..fcdac8791cd 100644 --- a/configs/_base_/models/convnext/convnext-large.py +++ b/configs/_base_/models/convnext/convnext-large.py @@ -19,5 +19,11 @@ type='LinearClsHead', num_classes=1000, in_channels=1536, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - )) + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0), + ]), +) diff --git a/configs/_base_/models/convnext/convnext-small.py b/configs/_base_/models/convnext/convnext-small.py index 989ad1d4e63..5686767f8ab 100644 --- a/configs/_base_/models/convnext/convnext-small.py +++ b/configs/_base_/models/convnext/convnext-small.py @@ -19,5 +19,11 @@ type='LinearClsHead', num_classes=1000, in_channels=768, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - )) + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0), + ]), +) diff --git a/configs/_base_/models/convnext/convnext-tiny.py b/configs/_base_/models/convnext/convnext-tiny.py index 0b692abb1cb..9db9d50a1cb 100644 --- a/configs/_base_/models/convnext/convnext-tiny.py +++ b/configs/_base_/models/convnext/convnext-tiny.py @@ -19,5 +19,11 @@ type='LinearClsHead', num_classes=1000, in_channels=768, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - )) + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0), + ]), 
+) diff --git a/configs/_base_/models/convnext/convnext-xlarge.py b/configs/_base_/models/convnext/convnext-xlarge.py index 0c75e32547b..00751c58aba 100644 --- a/configs/_base_/models/convnext/convnext-xlarge.py +++ b/configs/_base_/models/convnext/convnext-xlarge.py @@ -19,5 +19,11 @@ type='LinearClsHead', num_classes=1000, in_channels=2048, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - )) + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + ), + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0), + ]), +) diff --git a/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py b/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py index 5b52ea6edb4..fd06cc115a7 100644 --- a/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py +++ b/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py @@ -10,6 +10,7 @@ paramwise_cfg=dict( norm_decay_mult=0.0, bias_decay_mult=0.0, + flat_decay_mult=0.0, custom_keys={ '.absolute_pos_embed': dict(decay_mult=0.0), '.relative_position_bias_table': dict(decay_mult=0.0) diff --git a/configs/convnext/README.md b/configs/convnext/README.md index 18c3fcff772..e87302e381e 100644 --- a/configs/convnext/README.md +++ b/configs/convnext/README.md @@ -36,9 +36,9 @@ The "Roaring 20s" of visual recognition began with the introduction of Vision Tr ```python >>> import torch ->>> from mmcls.apis import init_model, inference_model +>>> from mmcls.apis import get_model, inference_model >>> ->>> model = init_model('configs/convnext/convnext-tiny_32xb128_in1k.py', 'https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128-noema_in1k_20220222-2908964a.pth') +>>> model = get_model('convnext-tiny_32xb128_in1k', pretrained=True) >>> predict = inference_model(model, 'demo/demo.JPEG') >>> print(predict['pred_class']) sea snake @@ -50,10 +50,10 @@ sea snake ```python >>> import torch ->>> from mmcls.apis import init_model +>>> from mmcls.apis import get_model >>> ->>> model = init_model('configs/convnext/convnext-tiny_32xb128_in1k.py', 'https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128-noema_in1k_20220222-2908964a.pth') ->>> inputs = torch.rand(1, 3, 224, 224).to(model.data_preprocessor.device) +>>> model = get_model('convnext-tiny_32xb128_in1k', pretrained=True) +>>> inputs = torch.rand(1, 3, 224, 224)) >>> # To get classification scores. 
>>> out = model(inputs) >>> print(out.shape) @@ -85,35 +85,37 @@ For more configurable parameters, please refer to the [API](https://mmclassifica ## Results and models -### ImageNet-1k - -| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-----------: | :----------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------: | :------------------------------------------------------------------------------------------------: | -| ConvNeXt-T\* | From scratch | 28.59 | 4.46 | 82.05 | 95.86 | [config](./convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth) | -| ConvNeXt-S\* | From scratch | 50.22 | 8.69 | 83.13 | 96.44 | [config](./convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth) | -| ConvNeXt-B\* | From scratch | 88.59 | 15.36 | 83.85 | 96.74 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | -| ConvNeXt-B\* | ImageNet-21k | 88.59 | 15.36 | 85.81 | 97.86 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | -| ConvNeXt-L\* | From scratch | 197.77 | 34.37 | 84.30 | 96.89 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | -| ConvNeXt-L\* | ImageNet-21k | 197.77 | 34.37 | 86.61 | 98.04 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | -| ConvNeXt-XL\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](./convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | - -*Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* - ### Pre-trained Models The pre-trained models on ImageNet-1k or ImageNet-21k are used to fine-tune on the downstream tasks. 
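As a rough illustration of how such a pre-trained checkpoint is usually consumed, the sketch below reuses the ConvNeXt-B recipe touched by this patch and loads the ImageNet-21k weights listed in the table into the backbone only. The `_base_` path, the 100-class head, and the `prefix='backbone.'` argument are illustrative assumptions following common OpenMMLab conventions, not something prescribed by this patch.

```python
# Hypothetical downstream config, e.g. configs/convnext/convnext-base_ft-example.py
_base_ = ['./convnext-base_32xb128_in1k.py']

# ImageNet-21k checkpoint from the table below.
checkpoint = 'https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth'  # noqa: E501

model = dict(
    # Load only the backbone weights; the 21k classification head is discarded.
    backbone=dict(
        init_cfg=dict(type='Pretrained', checkpoint=checkpoint,
                      prefix='backbone.')),
    # Replace the head to match the downstream dataset (100 classes here).
    head=dict(num_classes=100),
)
```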
-| Model | Training Data | Params(M) | Flops(G) | Download | -| :-----------: | :-----------: | :-------: | :------: | :-----------------------------------------------------------------------------------------------------------------------------------: | -| ConvNeXt-T\* | ImageNet-1k | 28.59 | 4.46 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128-noema_in1k_20220222-2908964a.pth) | -| ConvNeXt-S\* | ImageNet-1k | 50.22 | 8.69 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128-noema_in1k_20220222-fa001ca5.pth) | -| ConvNeXt-B\* | ImageNet-1k | 88.59 | 15.36 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128-noema_in1k_20220222-dba4f95f.pth) | -| ConvNeXt-B\* | ImageNet-21k | 88.59 | 15.36 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth) | -| ConvNeXt-L\* | ImageNet-21k | 197.77 | 34.37 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth) | -| ConvNeXt-XL\* | ImageNet-21k | 350.20 | 60.93 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth) | +| Model | Training Data | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | +| :------------------------------------------------- | :-----------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------------------------------------------: | +| ConvNeXt-T (`convnext-tiny_32xb128-noema_in1k`) | ImageNet-1k | 28.59 | 4.46 | 81.95 | 95.89 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128-noema_in1k_20221208-5d4509c7.pth) | +| ConvNeXt-S (`convnext-small_32xb128-noema_in1k`) | ImageNet-1k | 50.22 | 8.69 | 83.21 | 96.48 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128-noema_in1k_20221208-4a618995.pth) | +| ConvNeXt-B (`convnext-base_32xb128-noema_in1k`) | ImageNet-1k | 88.59 | 15.36 | 83.64 | 96.61 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128-noema_in1k_20221208-f8182678.pth) | +| ConvNeXt-B (`convnext-base_3rdparty-noema_in1k`)\* | ImageNet-1k | 88.59 | 15.36 | 83.71 | 96.60 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128-noema_in1k_20220222-dba4f95f.pth) | +| ConvNeXt-B (`convnext-base_3rdparty_in21k`)\* | ImageNet-21k | 88.59 | 15.36 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth) | +| ConvNeXt-L (`convnext-large_3rdparty_in21k`)\* | ImageNet-21k | 197.77 | 34.37 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth) | +| ConvNeXt-XL (`convnext-xlarge_3rdparty_in21k`)\* | ImageNet-21k | 350.20 | 60.93 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth) | *Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt).* +### ImageNet-1k + +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :----------------------------------------------------- | :----------: | :-------: | :------: | :-------: | :-------: | 
:----------------------------------------: | :-------------------------------------------------------: | +| ConvNeXt-T (`convnext-tiny_32xb128_in1k`) | From scratch | 28.59 | 4.46 | 82.14 | 96.06 | [config](./convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.log.json) | +| ConvNeXt-S (`convnext-small_32xb128_in1k`) | From scratch | 50.22 | 8.69 | 83.16 | 96.56 | [config](./convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.log.json) | +| ConvNeXt-B (`convnext-base_32xb128_in1k`) | From scratch | 88.59 | 15.36 | 83.66 | 96.74 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.log.json) | +| ConvNeXt-B (`convnext-base_3rdparty_in1k`)\* | From scratch | 88.59 | 15.36 | 83.85 | 96.74 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | +| ConvNeXt-B (`convnext-base_in21k-pre_3rdparty_in1k`)\* | ImageNet 21k | 88.59 | 15.36 | 85.81 | 97.86 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | +| ConvNeXt-L (`convnext-large_3rdparty_in1k`)\* | From scratch | 197.77 | 34.37 | 84.30 | 96.89 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | +| ConvNeXt-L (`convnext-large_in21k-pre_3rdparty_in1k`)\* | ImageNet 21k | 197.77 | 34.37 | 86.61 | 98.04 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | +| ConvNeXt-XL (`convnext-xlarge_in21k-pre_3rdparty_in1k`)\* | ImageNet 21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](./convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | + +*Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + ## Citation ```bibtex diff --git a/configs/convnext/convnext-base_32xb128_in1k.py b/configs/convnext/convnext-base_32xb128_in1k.py index ffa0eb981d9..5ae8ec47c4c 100644 --- a/configs/convnext/convnext-base_32xb128_in1k.py +++ b/configs/convnext/convnext-base_32xb128_in1k.py @@ -11,11 +11,11 @@ # schedule setting optim_wrapper = dict( optimizer=dict(lr=4e-3), - clip_grad=dict(max_norm=5.0), + clip_grad=None, ) # runtime setting -custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] +custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')] # NOTE: `auto_scale_lr` is for automatically scaling LR # based on the actual training batch size. diff --git a/configs/convnext/convnext-base_32xb128_in21k.py b/configs/convnext/convnext-base_32xb128_in21k.py new file mode 100644 index 00000000000..c343526c7f0 --- /dev/null +++ b/configs/convnext/convnext-base_32xb128_in21k.py @@ -0,0 +1,24 @@ +_base_ = [ + '../_base_/models/convnext/convnext-base.py', + '../_base_/datasets/imagenet21k_bs128.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# model setting +model = dict(head=dict(num_classes=21841)) + +# dataset setting +data_preprocessor = dict(num_classes=21841) +train_dataloader = dict(batch_size=128) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-large_64xb64_in1k.py b/configs/convnext/convnext-large_64xb64_in1k.py index 4f344a6ca7d..8a78c58bc3d 100644 --- a/configs/convnext/convnext-large_64xb64_in1k.py +++ b/configs/convnext/convnext-large_64xb64_in1k.py @@ -11,11 +11,11 @@ # schedule setting optim_wrapper = dict( optimizer=dict(lr=4e-3), - clip_grad=dict(max_norm=5.0), + clip_grad=None, ) # runtime setting -custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] +custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')] # NOTE: `auto_scale_lr` is for automatically scaling LR # based on the actual training batch size. diff --git a/configs/convnext/convnext-large_64xb64_in21k.py b/configs/convnext/convnext-large_64xb64_in21k.py new file mode 100644 index 00000000000..420edab67b1 --- /dev/null +++ b/configs/convnext/convnext-large_64xb64_in21k.py @@ -0,0 +1,24 @@ +_base_ = [ + '../_base_/models/convnext/convnext-base.py', + '../_base_/datasets/imagenet21k_bs128.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# model setting +model = dict(head=dict(num_classes=21841)) + +# dataset setting +data_preprocessor = dict(num_classes=21841) +train_dataloader = dict(batch_size=64) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. 
+# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-small_32xb128_in1k.py b/configs/convnext/convnext-small_32xb128_in1k.py index c7973068b62..b623e900f83 100644 --- a/configs/convnext/convnext-small_32xb128_in1k.py +++ b/configs/convnext/convnext-small_32xb128_in1k.py @@ -11,11 +11,11 @@ # schedule setting optim_wrapper = dict( optimizer=dict(lr=4e-3), - clip_grad=dict(max_norm=5.0), + clip_grad=None, ) # runtime setting -custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] +custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')] # NOTE: `auto_scale_lr` is for automatically scaling LR # based on the actual training batch size. diff --git a/configs/convnext/convnext-tiny_32xb128_in1k.py b/configs/convnext/convnext-tiny_32xb128_in1k.py index d4c9d40eadf..59d3004bde8 100644 --- a/configs/convnext/convnext-tiny_32xb128_in1k.py +++ b/configs/convnext/convnext-tiny_32xb128_in1k.py @@ -11,11 +11,11 @@ # schedule setting optim_wrapper = dict( optimizer=dict(lr=4e-3), - clip_grad=dict(max_norm=5.0), + clip_grad=None, ) # runtime setting -custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] +custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')] # NOTE: `auto_scale_lr` is for automatically scaling LR # based on the actual training batch size. diff --git a/configs/convnext/convnext-xlarge_64xb64_in1k.py b/configs/convnext/convnext-xlarge_64xb64_in1k.py index e431adb5e14..528894e808b 100644 --- a/configs/convnext/convnext-xlarge_64xb64_in1k.py +++ b/configs/convnext/convnext-xlarge_64xb64_in1k.py @@ -11,11 +11,11 @@ # schedule setting optim_wrapper = dict( optimizer=dict(lr=4e-3), - clip_grad=dict(max_norm=5.0), + clip_grad=None, ) # runtime setting -custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] +custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')] # NOTE: `auto_scale_lr` is for automatically scaling LR # based on the actual training batch size. diff --git a/configs/convnext/convnext-xlarge_64xb64_in21k.py b/configs/convnext/convnext-xlarge_64xb64_in21k.py new file mode 100644 index 00000000000..420edab67b1 --- /dev/null +++ b/configs/convnext/convnext-xlarge_64xb64_in21k.py @@ -0,0 +1,24 @@ +_base_ = [ + '../_base_/models/convnext/convnext-base.py', + '../_base_/datasets/imagenet21k_bs128.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# model setting +model = dict(head=dict(num_classes=21841)) + +# dataset setting +data_preprocessor = dict(num_classes=21841) +train_dataloader = dict(batch_size=64) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. 
+# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/metafile.yml b/configs/convnext/metafile.yml index 74b0a041c42..114a151e1a5 100644 --- a/configs/convnext/metafile.yml +++ b/configs/convnext/metafile.yml @@ -14,7 +14,7 @@ Collections: URL: https://github.com/open-mmlab/mmclassification/blob/v0.20.1/mmcls/models/backbones/convnext.py Models: - - Name: convnext-tiny_3rdparty_32xb128_in1k + - Name: convnext-tiny_32xb128_in1k Metadata: FLOPs: 4457472768 Parameters: 28589128 @@ -22,15 +22,12 @@ Models: Results: - Dataset: ImageNet-1k Metrics: - Top 1 Accuracy: 82.05 - Top 5 Accuracy: 95.86 + Top 1 Accuracy: 82.14 + Top 5 Accuracy: 96.06 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.pth Config: configs/convnext/convnext-tiny_32xb128_in1k.py - Converted From: - Weights: https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth - Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-tiny_3rdparty_32xb128-noema_in1k + - Name: convnext-tiny_32xb128-noema_in1k Metadata: Training Data: ImageNet-1k FLOPs: 4457472768 @@ -39,15 +36,12 @@ Models: Results: - Dataset: ImageNet-1k Metrics: - Top 1 Accuracy: 81.81 - Top 5 Accuracy: 95.67 + Top 1 Accuracy: 81.95 + Top 5 Accuracy: 95.89 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128-noema_in1k_20220222-2908964a.pth + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128-noema_in1k_20221208-5d4509c7.pth Config: configs/convnext/convnext-tiny_32xb128_in1k.py - Converted From: - Weights: https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224.pth - Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-small_3rdparty_32xb128_in1k + - Name: convnext-small_32xb128_in1k Metadata: Training Data: ImageNet-1k FLOPs: 8687008512 @@ -56,15 +50,12 @@ Models: Results: - Dataset: ImageNet-1k Metrics: - Top 1 Accuracy: 83.13 - Top 5 Accuracy: 96.44 + Top 1 Accuracy: 83.16 + Top 5 Accuracy: 96.56 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.pth Config: configs/convnext/convnext-small_32xb128_in1k.py - Converted From: - Weights: https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth - Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-small_3rdparty_32xb128-noema_in1k + - Name: convnext-small_32xb128-noema_in1k Metadata: Training Data: ImageNet-1k FLOPs: 8687008512 @@ -73,15 +64,40 @@ Models: Results: - Dataset: ImageNet-1k Metrics: - Top 1 Accuracy: 83.11 - Top 5 Accuracy: 96.34 + Top 1 Accuracy: 83.21 + Top 5 Accuracy: 96.48 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128-noema_in1k_20220222-fa001ca5.pth + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128-noema_in1k_20221208-4a618995.pth Config: configs/convnext/convnext-small_32xb128_in1k.py - Converted From: - Weights: 
https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224.pth - Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-base_3rdparty_32xb128_in1k + - Name: convnext-base_32xb128_in1k + Metadata: + Training Data: ImageNet-1k + FLOPs: 15359124480 + Parameters: 88591464 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.66 + Top 5 Accuracy: 96.74 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.pth + Config: configs/convnext/convnext-base_32xb128_in1k.py + - Name: convnext-base_32xb128-noema_in1k + Metadata: + Training Data: ImageNet-1k + FLOPs: 15359124480 + Parameters: 88591464 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.64 + Top 5 Accuracy: 96.61 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128-noema_in1k_20221208-f8182678.pth + Config: configs/convnext/convnext-base_32xb128_in1k.py + - Name: convnext-base_3rdparty_in1k Metadata: Training Data: ImageNet-1k FLOPs: 15359124480 @@ -98,7 +114,7 @@ Models: Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-base_3rdparty_32xb128-noema_in1k + - Name: convnext-base_3rdparty-noema_in1k Metadata: Training Data: ImageNet-1k FLOPs: 15359124480 @@ -123,10 +139,11 @@ Models: In Collection: ConvNeXt Results: null Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth + Config: configs/convnext/convnext-base_32xb128_in21k.py Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-base_in21k-pre-3rdparty_32xb128_in1k + - Name: convnext-base_in21k-pre_3rdparty_in1k Metadata: Training Data: - ImageNet-21k @@ -145,7 +162,7 @@ Models: Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-large_3rdparty_64xb64_in1k + - Name: convnext-large_3rdparty_in1k Metadata: Training Data: ImageNet-1k FLOPs: 34368026112 @@ -170,10 +187,11 @@ Models: In Collection: ConvNeXt Results: null Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth + Config: configs/convnext/convnext-large_64xb64_in21k.py Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-large_in21k-pre-3rdparty_64xb64_in1k + - Name: convnext-large_in21k-pre_3rdparty_in1k Metadata: Training Data: - ImageNet-21k @@ -200,10 +218,11 @@ Models: In Collection: ConvNeXt Results: null Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth + Config: configs/convnext/convnext-xlarge_64xb64_in21k.py Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth Code: https://github.com/facebookresearch/ConvNeXt - - Name: convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k + - Name: convnext-xlarge_in21k-pre_3rdparty_in1k Metadata: Training Data: - ImageNet-21k diff --git a/docs/en/migration.md b/docs/en/migration.md index ff585d4c5c2..b5fda68063f 100644 --- a/docs/en/migration.md +++ 
b/docs/en/migration.md @@ -481,7 +481,7 @@ visualizer = dict( ) ``` -New field **`default_scope`**: The start point to search module for all registries. The `default_scope` in MMClassification is `mmcls`. See {external+mmengine:doc}`the registry tutorial ` for more details. +New field **`default_scope`**: The start point to search module for all registries. The `default_scope` in MMClassification is `mmcls`. See {external+mmengine:doc}`the registry tutorial ` for more details. ## Packages diff --git a/docs/en/user_guides/config.md b/docs/en/user_guides/config.md index 39e95c7cb78..84acc55aeb7 100644 --- a/docs/en/user_guides/config.md +++ b/docs/en/user_guides/config.md @@ -2,7 +2,7 @@ To manage various configurations in a deep-learning experiment, we use a kind of config file to record all of these configurations. This config system has a modular and inheritance design, and more details can be found in -{external+mmengine:doc}`the tutorial in MMEngine `. +{external+mmengine:doc}`the tutorial in MMEngine `. Usually, we use python files as config file. All configuration files are placed under the [`configs`](https://github.com/open-mmlab/mmclassification/tree/1.x/configs) folder, and the directory structure is as follows: @@ -64,7 +64,7 @@ This primitive config file includes a dict variable `model`, which mainly includ ```{note} Usually, we use the `type` field to specify the class of the component and use other fields to pass -the initialization arguments of the class. The {external+mmengine:doc}`registry tutorial ` describes it in detail. +the initialization arguments of the class. The {external+mmengine:doc}`registry tutorial ` describes it in detail. ``` Following is the model primitive config of the ResNet50 config file in [`configs/_base_/models/resnet50.py`](https://github.com/open-mmlab/mmclassification/blob/1.x/configs/_base_/models/resnet50.py): @@ -348,7 +348,7 @@ test_dataloader = dict(dataset=dict(pipeline=val_pipeline)) ### Ignore some fields in the base configs -Sometimes, you need to set `_delete_=True` to ignore some domain content in the basic configuration file. You can refer to the {external+mmengine:doc}`documentation in MMEngine ` for more instructions. +Sometimes, you need to set `_delete_=True` to ignore some domain content in the basic configuration file. You can refer to the {external+mmengine:doc}`documentation in MMEngine ` for more instructions. The following is an example. If you want to use cosine schedule in the above ResNet50 case, just using inheritance and directly modifying it will report `get unexpected keyword 'step'` error, because the `'step'` field of the basic config in `param_scheduler` domain information is reserved, and you need to add `_delete_ =True` to ignore the content of `param_scheduler` related fields in the basic configuration file: @@ -361,7 +361,7 @@ param_scheduler = dict(type='CosineAnnealingLR', by_epoch=True, _delete_=True) ### Use some fields in the base configs -Sometimes, you may refer to some fields in the `_base_` config, to avoid duplication of definitions. You can refer to {external+mmengine:doc}`MMEngine ` for some more instructions. +Sometimes, you may refer to some fields in the `_base_` config, to avoid duplication of definitions. You can refer to {external+mmengine:doc}`MMEngine ` for some more instructions. 
The following is an example of using auto augment in the training data preprocessing pipeline, refer to [`configs/resnest/resnest50_32xb64_in1k.py`](https://github.com/open-mmlab/mmclassification/blob/1.x/configs/resnest/resnest50_32xb64_in1k.py). When defining `train_pipeline`, just add the definition file name of auto augment to `_base_`, and then use `_base_.auto_increasing_policies` to reference the variables in the primitive config: diff --git a/mmcls/models/backbones/convnext.py b/mmcls/models/backbones/convnext.py index eeb4b420556..15096b5fdb1 100644 --- a/mmcls/models/backbones/convnext.py +++ b/mmcls/models/backbones/convnext.py @@ -31,12 +31,20 @@ def __init__(self, num_channels: int, **kwargs) -> None: super().__init__(num_channels, **kwargs) self.num_channels = self.normalized_shape[0] - def forward(self, x): + def forward(self, x, data_format='channel_first'): assert x.dim() == 4, 'LayerNorm2d only supports inputs with shape ' \ f'(N, C, H, W), but got tensor with shape {x.shape}' - return F.layer_norm( - x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, - self.bias, self.eps).permute(0, 3, 1, 2) + if data_format == 'channel_last': + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, + self.eps) + elif data_format == 'channel_first': + x = x.permute(0, 2, 3, 1) + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, + self.eps) + # If the output is discontiguous, it may cause some unexpected + # problem in the downstream tasks + x = x.permute(0, 3, 1, 2).contiguous() + return x class ConvNeXtBlock(BaseModule): @@ -113,10 +121,10 @@ def forward(self, x): def _inner_forward(x): shortcut = x x = self.depthwise_conv(x) - x = self.norm(x) if self.linear_pw_conv: x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x, data_format='channel_last') x = self.pointwise_conv1(x) x = self.act(x) @@ -284,7 +292,7 @@ def __init__(self, if i >= 1: downsample_layer = nn.Sequential( - LayerNorm2d(self.channels[i - 1]), + build_norm_layer(norm_cfg, self.channels[i - 1])[1], nn.Conv2d( self.channels[i - 1], channels, @@ -324,9 +332,7 @@ def forward(self, x): gap = x.mean([-2, -1], keepdim=True) outs.append(norm_layer(gap).flatten(1)) else: - # The output of LayerNorm2d may be discontiguous, which - # may cause some problem in the downstream tasks - outs.append(norm_layer(x).contiguous()) + outs.append(norm_layer(x)) return tuple(outs) From 3006fa26ab1ffea64bcfde0a9c8fddf20b24e879 Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Mon, 19 Dec 2022 13:54:52 +0800 Subject: [PATCH 09/21] [Fix] Fix CAM visualization. 
(#1248) --- tools/visualizations/vis_cam.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/visualizations/vis_cam.py b/tools/visualizations/vis_cam.py index 83241cae36e..61b5fa5716a 100644 --- a/tools/visualizations/vis_cam.py +++ b/tools/visualizations/vis_cam.py @@ -10,6 +10,7 @@ import numpy as np from mmcv.transforms import Compose from mmengine.config import Config, DictAction +from mmengine.dataset import default_collate from mmengine.utils import to_2tuple from torch.nn import BatchNorm1d, BatchNorm2d, GroupNorm, LayerNorm @@ -276,7 +277,7 @@ def main(): transforms = Compose(cfg.test_dataloader.dataset.pipeline) data = transforms({'img_path': args.img}) src_img = copy.deepcopy(data['inputs']).numpy().transpose(1, 2, 0) - data = model.data_preprocessor(data, False) + data = model.data_preprocessor(default_collate([data]), False) # build target layers if args.target_layers: @@ -306,7 +307,7 @@ def main(): # calculate cam grads and show|save the visualization image grayscale_cam = cam( - data['inputs'].unsqueeze(0), + data['inputs'], targets, eigen_smooth=args.eigen_smooth, aug_smooth=args.aug_smooth) From 9e82db6032e2c0ae665ffb6842b6066b322b6fb0 Mon Sep 17 00:00:00 2001 From: takuoko Date: Mon, 19 Dec 2022 18:17:57 +0900 Subject: [PATCH 10/21] [Enhance] Support ConvNeXt More Weights. (#1240) * convnext more weights * Update metafile and README * Fix link Co-authored-by: mzr1996 --- configs/convnext/README.md | 37 ++-- .../convnext-base_32xb128_in1k-384px.py | 23 +++ .../convnext-large_64xb64_in1k-384px.py | 23 +++ .../convnext-small_32xb128_in1k-384px.py | 23 +++ .../convnext-tiny_32xb128_in1k-384px.py | 23 +++ .../convnext-xlarge_64xb64_in1k-384px.py | 23 +++ configs/convnext/metafile.yml | 167 ++++++++++++++++++ tools/model_converters/convnext_to_mmcls.py | 62 +++++++ 8 files changed, 367 insertions(+), 14 deletions(-) create mode 100644 configs/convnext/convnext-base_32xb128_in1k-384px.py create mode 100644 configs/convnext/convnext-large_64xb64_in1k-384px.py create mode 100644 configs/convnext/convnext-small_32xb128_in1k-384px.py create mode 100644 configs/convnext/convnext-tiny_32xb128_in1k-384px.py create mode 100644 configs/convnext/convnext-xlarge_64xb64_in1k-384px.py create mode 100644 tools/model_converters/convnext_to_mmcls.py diff --git a/configs/convnext/README.md b/configs/convnext/README.md index e87302e381e..4391348c521 100644 --- a/configs/convnext/README.md +++ b/configs/convnext/README.md @@ -94,25 +94,34 @@ The pre-trained models on ImageNet-1k or ImageNet-21k are used to fine-tune on t | ConvNeXt-T (`convnext-tiny_32xb128-noema_in1k`) | ImageNet-1k | 28.59 | 4.46 | 81.95 | 95.89 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128-noema_in1k_20221208-5d4509c7.pth) | | ConvNeXt-S (`convnext-small_32xb128-noema_in1k`) | ImageNet-1k | 50.22 | 8.69 | 83.21 | 96.48 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128-noema_in1k_20221208-4a618995.pth) | | ConvNeXt-B (`convnext-base_32xb128-noema_in1k`) | ImageNet-1k | 88.59 | 15.36 | 83.64 | 96.61 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128-noema_in1k_20221208-f8182678.pth) | -| ConvNeXt-B (`convnext-base_3rdparty-noema_in1k`)\* | ImageNet-1k | 88.59 | 15.36 | 83.71 | 96.60 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128-noema_in1k_20220222-dba4f95f.pth) | -| ConvNeXt-B (`convnext-base_3rdparty_in21k`)\* | 
ImageNet-21k | 88.59 | 15.36 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth) | -| ConvNeXt-L (`convnext-large_3rdparty_in21k`)\* | ImageNet-21k | 197.77 | 34.37 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth) | -| ConvNeXt-XL (`convnext-xlarge_3rdparty_in21k`)\* | ImageNet-21k | 350.20 | 60.93 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth) | +| ConvNeXt-B\* (`convnext-base_3rdparty-noema_in1k`) | ImageNet-1k | 88.59 | 15.36 | 83.71 | 96.60 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128-noema_in1k_20220222-dba4f95f.pth) | +| ConvNeXt-B\* (`convnext-base_3rdparty_in21k`) | ImageNet-21k | 88.59 | 15.36 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth) | +| ConvNeXt-L\* (`convnext-large_3rdparty_in21k`) | ImageNet-21k | 197.77 | 34.37 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth) | +| ConvNeXt-XL\* (`convnext-xlarge_3rdparty_in21k`) | ImageNet-21k | 350.20 | 60.93 | N/A | N/A | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth) | *Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt).* ### ImageNet-1k -| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :----------------------------------------------------- | :----------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------: | :-------------------------------------------------------: | -| ConvNeXt-T (`convnext-tiny_32xb128_in1k`) | From scratch | 28.59 | 4.46 | 82.14 | 96.06 | [config](./convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.log.json) | -| ConvNeXt-S (`convnext-small_32xb128_in1k`) | From scratch | 50.22 | 8.69 | 83.16 | 96.56 | [config](./convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.log.json) | -| ConvNeXt-B (`convnext-base_32xb128_in1k`) | From scratch | 88.59 | 15.36 | 83.66 | 96.74 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.log.json) | -| ConvNeXt-B (`convnext-base_3rdparty_in1k`)\* | From scratch | 88.59 | 15.36 | 83.85 | 96.74 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | -| ConvNeXt-B (`convnext-base_in21k-pre_3rdparty_in1k`)\* | ImageNet 21k | 88.59 | 15.36 | 85.81 | 97.86 | [config](./convnext-base_32xb128_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | -| ConvNeXt-L (`convnext-large_3rdparty_in1k`)\* | From scratch | 197.77 | 34.37 | 84.30 | 96.89 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | -| ConvNeXt-L (`convnext-large_in21k-pre_3rdparty_in1k`)\* | ImageNet 21k | 197.77 | 34.37 | 86.61 | 98.04 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | -| ConvNeXt-XL (`convnext-xlarge_in21k-pre_3rdparty_in1k`)\* | ImageNet 21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](./convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | +| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :--------------------------------------------- | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------: | :------------------------------------------------: | +| ConvNeXt-T (`convnext-tiny_32xb128_in1k`) | From scratch | 224x224 | 28.59 | 4.46 | 82.14 | 96.06 | [config](./convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128_in1k_20221207-998cf3e9.log.json) | +| ConvNeXt-T\* (`convnext-tiny_in21k-pre_3rdparty_in1k`) | ImageNet-21k | 224x224 | 28.59 | 4.46 | 82.90 | 96.62 | [config](./convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_in21k-pre_3rdparty_in1k_20221219-7501e534.pth) | +| ConvNeXt-T\* (`convnext-tiny_in21k-pre_3rdparty_in1k-384px`) | ImageNet-21k | 384x384 | 28.59 | 13.13 | 84.11 | 97.14 | [config](./convnext-tiny_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_in21k-pre_3rdparty_in1k-384px_20221219-c1182362.pth) | +| ConvNeXt-S (`convnext-small_32xb128_in1k`) | From scratch | 224x224 | 50.22 | 8.69 | 83.16 | 96.56 | [config](./convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128_in1k_20221207-4ab7052c.log.json) | +| ConvNeXt-S\* (`convnext-small_in21k-pre_3rdparty_in1k`) | ImageNet-21k | 224x224 | 50.22 | 8.69 | 84.59 | 97.41 | [config](./convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_in21k-pre_3rdparty_in1k_20221219-aeca4c93.pth) | +| ConvNeXt-S\* (`convnext-small_in21k-pre_3rdparty_in1k-384px`) | ImageNet-21k | 384x384 | 50.22 | 25.58 | 85.75 | 97.88 | [config](./convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_in21k-pre_3rdparty_in1k-384px_20221219-96f0bb87.pth) | +| ConvNeXt-B (`convnext-base_32xb128_in1k`) | From scratch | 224x224 | 88.59 | 15.36 | 83.66 | 96.74 | [config](./convnext-base_32xb128_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_32xb128_in1k_20221207-fbdb5eb9.log.json) | +| ConvNeXt-B\* (`convnext-base_3rdparty_in1k`) | From scratch | 224x224 | 88.59 | 15.36 | 83.85 | 96.74 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | +| ConvNeXt-B (`convnext-base_3rdparty_in1k-384px`)\* | From scratch | 384x384 | 88.59 | 45.21 | 85.10 | 97.34 | [config](./convnext-base_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128-noema_in1k_20220222-dba4f95f.pth) | +| ConvNeXt-B\* (`convnext-base_in21k-pre_3rdparty_in1k`) | ImageNet 21k | 224x224 | 88.59 | 15.36 | 85.81 | 97.86 | [config](./convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | +| ConvNeXt-B\* (`convnext-base_in21k-pre-3rdparty_in1k-384px`) | ImageNet-21k | 384x384 | 88.59 | 45.21 | 86.82 | 98.25 | [config](./convnext-base_32xb128_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in1k-384px_20221219-c8f1dc2b.pth) | +| ConvNeXt-L\* (`convnext-large_3rdparty_in1k`) | From scratch | 224x224 | 197.77 | 34.37 | 84.30 | 96.89 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | +| ConvNeXt-L\* (`convnext-large_3rdparty_in1k-384px`) | From scratch | 384x384 | 197.77 | 101.10 | 85.50 | 97.59 | [config](./convnext-large_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in1k-384px_20221219-6dd29d10.pth) | +| ConvNeXt-L\* (`convnext-large_in21k-pre_3rdparty_in1k`) | ImageNet 21k | 224x224 | 197.77 | 34.37 | 86.61 | 98.04 | [config](./convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | +| ConvNeXt-L (`convnext-large_in21k-pre-3rdparty_in1k-384px`)\* | ImageNet-21k | 384x384 | 197.77 | 101.10 | 87.46 | 98.37 | [config](./convnext-large_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_in1k-384px_20221219-6d38dd66.pth) | +| ConvNeXt-XL\* (`convnext-xlarge_in21k-pre_3rdparty_in1k`) | ImageNet 21k | 224x224 | 350.20 | 60.93 | 86.97 | 98.20 | [config](./convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | +| ConvNeXt-XL\* (`convnext-xlarge_in21k-pre-3rdparty_in1k-384px`) | ImageNet-21k | 384x384 | 350.20 | 179.20 | 87.76 | 98.55 | [config](./convnext-xlarge_64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_in1k-384px_20221219-b161bc14.pth) | *Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/convnext/convnext-base_32xb128_in1k-384px.py b/configs/convnext/convnext-base_32xb128_in1k-384px.py new file mode 100644 index 00000000000..65546942562 --- /dev/null +++ b/configs/convnext/convnext-base_32xb128_in1k-384px.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/convnext/convnext-base.py', + '../_base_/datasets/imagenet_bs64_swin_384.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# dataset setting +train_dataloader = dict(batch_size=128) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-large_64xb64_in1k-384px.py b/configs/convnext/convnext-large_64xb64_in1k-384px.py new file mode 100644 index 00000000000..6698b9edcda --- /dev/null +++ b/configs/convnext/convnext-large_64xb64_in1k-384px.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/convnext/convnext-large.py', + '../_base_/datasets/imagenet_bs64_swin_384.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# dataset setting +train_dataloader = dict(batch_size=64) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. +# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-small_32xb128_in1k-384px.py b/configs/convnext/convnext-small_32xb128_in1k-384px.py new file mode 100644 index 00000000000..729f00ad2fd --- /dev/null +++ b/configs/convnext/convnext-small_32xb128_in1k-384px.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/convnext/convnext-small.py', + '../_base_/datasets/imagenet_bs64_swin_384.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# dataset setting +train_dataloader = dict(batch_size=128) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. 
+# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-tiny_32xb128_in1k-384px.py b/configs/convnext/convnext-tiny_32xb128_in1k-384px.py new file mode 100644 index 00000000000..6513ad8dfa4 --- /dev/null +++ b/configs/convnext/convnext-tiny_32xb128_in1k-384px.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/convnext/convnext-tiny.py', + '../_base_/datasets/imagenet_bs64_swin_384.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# dataset setting +train_dataloader = dict(batch_size=128) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. +# base_batch_size = (32 GPUs) x (128 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/convnext-xlarge_64xb64_in1k-384px.py b/configs/convnext/convnext-xlarge_64xb64_in1k-384px.py new file mode 100644 index 00000000000..6edc94d2448 --- /dev/null +++ b/configs/convnext/convnext-xlarge_64xb64_in1k-384px.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/convnext/convnext-xlarge.py', + '../_base_/datasets/imagenet_bs64_swin_384.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +# dataset setting +train_dataloader = dict(batch_size=64) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) + +# runtime setting +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] + +# NOTE: `auto_scale_lr` is for automatically scaling LR +# based on the actual training batch size. 
+# base_batch_size = (64 GPUs) x (64 samples per GPU) +auto_scale_lr = dict(base_batch_size=4096) diff --git a/configs/convnext/metafile.yml b/configs/convnext/metafile.yml index 114a151e1a5..542bbcd7ac9 100644 --- a/configs/convnext/metafile.yml +++ b/configs/convnext/metafile.yml @@ -41,6 +41,44 @@ Models: Task: Image Classification Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_32xb128-noema_in1k_20221208-5d4509c7.pth Config: configs/convnext/convnext-tiny_32xb128_in1k.py + - Name: convnext-tiny_in21k-pre_3rdparty_in1k + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 4457472768 + Parameters: 28589128 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.90 + Top 5 Accuracy: 96.62 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_in21k-pre_3rdparty_in1k_20221219-7501e534.pth + Config: configs/convnext/convnext-tiny_32xb128_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-tiny_in21k-pre_3rdparty_in1k-384px + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 13135236864 + Parameters: 28589128 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.11 + Top 5 Accuracy: 97.14 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_in21k-pre_3rdparty_in1k-384px_20221219-c1182362.pth + Config: configs/convnext/convnext-tiny_32xb128_in1k-384px.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth + Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-small_32xb128_in1k Metadata: Training Data: ImageNet-1k @@ -69,6 +107,44 @@ Models: Task: Image Classification Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_32xb128-noema_in1k_20221208-4a618995.pth Config: configs/convnext/convnext-small_32xb128_in1k.py + - Name: convnext-small_in21k-pre_3rdparty_in1k + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 8687008512 + Parameters: 50223688 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.59 + Top 5 Accuracy: 97.41 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_in21k-pre_3rdparty_in1k_20221219-aeca4c93.pth + Config: configs/convnext/convnext-small_32xb128_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-small_in21k-pre_3rdparty_in1k-384px + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 25580818176 + Parameters: 50223688 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.75 + Top 5 Accuracy: 97.88 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_in21k-pre_3rdparty_in1k-384px_20221219-96f0bb87.pth + Config: configs/convnext/convnext-small_32xb128_in1k-384px.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth + Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-base_32xb128_in1k Metadata: Training Data: ImageNet-1k @@ -131,6 +207,23 @@ Models: Converted 
From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224.pth Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-base_3rdparty_in1k-384px + Metadata: + Training Data: ImageNet-1k + FLOPs: 45205885952 + Parameters: 88591464 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.10 + Top 5 Accuracy: 97.34 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in1k-384px_20221219-c8f1dc2b.pth + Config: configs/convnext/convnext-base_32xb128_in1k-384px.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_384.pth + Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-base_3rdparty_in21k Metadata: Training Data: ImageNet-21k @@ -162,6 +255,25 @@ Models: Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-base_in21k-pre-3rdparty_in1k-384px + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 45205885952 + Parameters: 88591464 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.82 + Top 5 Accuracy: 98.25 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_in1k-384px_20221219-4570f792.pth + Config: configs/convnext/convnext-base_32xb128_in1k-384px.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth + Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-large_3rdparty_in1k Metadata: Training Data: ImageNet-1k @@ -179,6 +291,23 @@ Models: Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-large_3rdparty_in1k-384px + Metadata: + Training Data: ImageNet-1k + FLOPs: 101103214080 + Parameters: 197767336 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.50 + Top 5 Accuracy: 97.59 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in1k-384px_20221219-6dd29d10.pth + Config: configs/convnext/convnext-large_64xb64_in1k-384px.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_384.pth + Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-large_3rdparty_in21k Metadata: Training Data: ImageNet-21k @@ -210,6 +339,25 @@ Models: Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-large_in21k-pre-3rdparty_in1k-384px + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 101103214080 + Parameters: 197767336 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 87.46 + Top 5 Accuracy: 98.37 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_in1k-384px_20221219-6d38dd66.pth + Config: configs/convnext/convnext-large_64xb64_in1k-384px.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth + Code: https://github.com/facebookresearch/ConvNeXt - Name: convnext-xlarge_3rdparty_in21k Metadata: Training Data: ImageNet-21k @@ -241,3 +389,22 @@ Models: 
Converted From: Weights: https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-xlarge_in21k-pre-3rdparty_in1k-384px + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 179196798976 + Parameters: 350196968 + In Collection: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 87.76 + Top 5 Accuracy: 98.55 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_in1k-384px_20221219-b161bc14.pth + Config: configs/convnext/convnext-xlarge_64xb64_in1k-384px.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth + Code: https://github.com/facebookresearch/ConvNeXt diff --git a/tools/model_converters/convnext_to_mmcls.py b/tools/model_converters/convnext_to_mmcls.py new file mode 100644 index 00000000000..da24aca82ae --- /dev/null +++ b/tools/model_converters/convnext_to_mmcls.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from collections import OrderedDict + +import mmengine +import torch +from mmengine.runner import CheckpointLoader + + +def convert_convnext(ckpt): + + new_ckpt = OrderedDict() + + for k, v in list(ckpt.items()): + new_v = v + if k.startswith('head'): + new_k = k.replace('head.', 'head.fc.') + new_ckpt[new_k] = new_v + continue + elif k.startswith('stages'): + if 'dwconv' in k: + new_k = k.replace('dwconv', 'depthwise_conv') + elif 'pwconv' in k: + new_k = k.replace('pwconv', 'pointwise_conv') + else: + new_k = k + elif k.startswith('norm'): + new_k = k.replace('norm', 'norm3') + else: + new_k = k + + if not new_k.startswith('head'): + new_k = 'backbone.' + new_k + new_ckpt[new_k] = new_v + return new_ckpt + + +def main(): + parser = argparse.ArgumentParser( + description='Convert keys in pretrained van models to mmcls style.') + parser.add_argument('src', help='src model path or url') + # The dst path must be a full path of the new checkpoint. + parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + + if 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + + weight = convert_convnext(state_dict) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(dict(state_dict=weight), args.dst) + + print('Done!!') + + +if __name__ == '__main__': + main() From 5547f4cac4db52d6ee995c9c276be9dccf507dbf Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Tue, 20 Dec 2022 13:04:00 +0800 Subject: [PATCH 11/21] [Feature] Add TinyViT for dev-1.x. 
(#1042) * [Feature] add TinyViT for dev-1.x * [Feature] update readme * fix lint error * refactor the code * [Update] update the args * [Update] add unit test and fix bugs * Rename the configuration file * delete invalid files * [Feature] update tinyvit readme * [Feature] update tinyvit readme * [Feature] update metafile * Update tinyvit metafile --- configs/_base_/models/tinyvit/tinyvit-11m.py | 25 + configs/_base_/models/tinyvit/tinyvit-21m.py | 25 + configs/_base_/models/tinyvit/tinyvit-5m.py | 25 + configs/tinyvit/README.md | 34 + configs/tinyvit/metafile.yml | 144 ++++ .../tinyvit-11m-distill_8xb256_in1k.py | 3 + configs/tinyvit/tinyvit-11m_8xb256_in1k.py | 6 + .../tinyvit-21m-distill_8xb256_in1k-384px.py | 29 + .../tinyvit-21m-distill_8xb256_in1k-512px.py | 28 + .../tinyvit-21m-distill_8xb256_in1k.py | 3 + configs/tinyvit/tinyvit-21m_8xb256_in1k.py | 6 + .../tinyvit/tinyvit-5m-distill_8xb256_in1k.py | 3 + configs/tinyvit/tinyvit-5m_8xb256_in1k.py | 6 + mmcls/models/backbones/__init__.py | 2 + mmcls/models/backbones/tinyvit.py | 769 ++++++++++++++++++ mmcls/models/utils/__init__.py | 38 +- mmcls/models/utils/attention.py | 90 ++ model-index.yml | 1 + .../test_backbones/test_tinyvit.py | 80 ++ tools/model_converters/tinyvit_to_mmcls.py | 61 ++ 20 files changed, 1349 insertions(+), 29 deletions(-) create mode 100644 configs/_base_/models/tinyvit/tinyvit-11m.py create mode 100644 configs/_base_/models/tinyvit/tinyvit-21m.py create mode 100644 configs/_base_/models/tinyvit/tinyvit-5m.py create mode 100644 configs/tinyvit/README.md create mode 100644 configs/tinyvit/metafile.yml create mode 100644 configs/tinyvit/tinyvit-11m-distill_8xb256_in1k.py create mode 100644 configs/tinyvit/tinyvit-11m_8xb256_in1k.py create mode 100644 configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-384px.py create mode 100644 configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-512px.py create mode 100644 configs/tinyvit/tinyvit-21m-distill_8xb256_in1k.py create mode 100644 configs/tinyvit/tinyvit-21m_8xb256_in1k.py create mode 100644 configs/tinyvit/tinyvit-5m-distill_8xb256_in1k.py create mode 100644 configs/tinyvit/tinyvit-5m_8xb256_in1k.py create mode 100644 mmcls/models/backbones/tinyvit.py create mode 100644 tests/test_models/test_backbones/test_tinyvit.py create mode 100644 tools/model_converters/tinyvit_to_mmcls.py diff --git a/configs/_base_/models/tinyvit/tinyvit-11m.py b/configs/_base_/models/tinyvit/tinyvit-11m.py new file mode 100644 index 00000000000..6c046e35a0f --- /dev/null +++ b/configs/_base_/models/tinyvit/tinyvit-11m.py @@ -0,0 +1,25 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='TinyViT', + arch='11m', + img_size=(224, 224), + window_size=[7, 7, 14, 7], + out_indices=(3, ), + drop_path_rate=0.1, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=448, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/tinyvit/tinyvit-21m.py b/configs/_base_/models/tinyvit/tinyvit-21m.py new file mode 100644 index 00000000000..7f362f8f627 --- /dev/null +++ b/configs/_base_/models/tinyvit/tinyvit-21m.py @@ -0,0 +1,25 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='TinyViT', + arch='21m', + img_size=(224, 224), + window_size=[7, 7, 14, 7], + out_indices=(3, ), + drop_path_rate=0.2, + 
gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=576, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/tinyvit/tinyvit-5m.py b/configs/_base_/models/tinyvit/tinyvit-5m.py new file mode 100644 index 00000000000..923ebd918f8 --- /dev/null +++ b/configs/_base_/models/tinyvit/tinyvit-5m.py @@ -0,0 +1,25 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='TinyViT', + arch='5m', + img_size=(224, 224), + window_size=[7, 7, 14, 7], + out_indices=(3, ), + drop_path_rate=0.0, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=320, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/tinyvit/README.md b/configs/tinyvit/README.md new file mode 100644 index 00000000000..b870b73b089 --- /dev/null +++ b/configs/tinyvit/README.md @@ -0,0 +1,34 @@ +# TinyViT + +> [TinyViT: Fast Pretraining Distillation for Small Vision Transformers](https://arxiv.org/abs/2207.10666) + + + +## Abstract + + + +Vision transformer (ViT) recently has drawn great attention in computer vision due to its remarkable model capability. However, most prevailing ViT models suffer from huge number of parameters, restricting their applicability on devices with limited resources. To alleviate this issue, we propose TinyViT, a new family of tiny and efficient small vision transformers pretrained on large-scale datasets with our proposed fast distillation framework. The central idea is to transfer knowledge from large pretrained models to small ones, while enabling small models to get the dividends of massive pretraining data. More specifically, we apply distillation during pretraining for knowledge transfer. The logits of large teacher models are sparsified and stored in disk in advance to save the memory cost and computation overheads. The tiny student transformers are automatically scaled down from a large pretrained model with computation and parameter constraints. Comprehensive experiments demonstrate the efficacy of TinyViT. It achieves a top-1 accuracy of 84.8% on ImageNet-1k with only 21M parameters, being comparable to SwinB pretrained on ImageNet-21k while using 4.2 times fewer parameters. Moreover, increasing image resolutions, TinyViT can reach 86.5% accuracy, being slightly better than Swin-L while using only 11% parameters. Last but not the least, we demonstrate a good transfer ability of TinyViT on various downstream tasks. + + + +
+ +
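+A quick way to try the backbone is through the high-level `mmcls.apis` helpers. The snippet below is
+only an illustrative sketch: it assumes the `init_model`/`inference_model` API of the `dev-1.x` branch,
+and the checkpoint path is a placeholder for a weight file converted with
+`tools/model_converters/tinyvit_to_mmcls.py`.
+
+```python
+from mmcls.apis import inference_model, init_model
+
+config = 'configs/tinyvit/tinyvit-5m_8xb256_in1k.py'
+# Placeholder path: a checkpoint converted by tools/model_converters/tinyvit_to_mmcls.py.
+checkpoint = 'work_dirs/tinyvit-5m_converted.pth'
+
+model = init_model(config, checkpoint, device='cpu')
+result = inference_model(model, 'demo/demo.JPEG')
+print(result)
+```
+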
+ +## Results and models + +### ImageNet-1k + +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-------------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------: | :------------------------------------------------------------------------: | +| TinyViT-5M-224\* | From scratch | 5.39 | 1.29 | 79.02 | 94.74 | [config](./tinyvit-5m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth) | +| TinyViT-11M-224\* | From scratch | 11.00 | 2.05 | 81.44 | 95.79 | [config](./tinyvit-11m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth) | +| TinyViT-21M-224\* | From scratch | 21.20 | 4.30 | 83.08 | 96.54 | [config](./tinyvit-21m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | +| TinyViT-5M-224-Distilled\* | ImageNet-21k | 5.39 | 1.29 | 80.71 | 95.57 | [config](./tinyvit-5m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | +| TinyViT-11M-224-Distilled\* | ImageNet-21k | 11.00 | 2.05 | 83.19 | 96.53 | [config](./tinyvit-11m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | +| TinyViT-21M-224-Distilled\* | ImageNet-21k | 21.20 | 4.30 | 84.85 | 97.27 | [config](./tinyvit-21m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | +| TinyViT-21M-384-Distilled\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](./tinyvit-21m-distill_8xb256_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | +| TinyViT-21M-512-Distilled\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](./tinyvit-21m-distill_8xb256_in1k-512px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | + +*Models with * are converted from the [official repo](https://github.com/microsoft/Cream/tree/main/TinyViT). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/tinyvit/metafile.yml b/configs/tinyvit/metafile.yml new file mode 100644 index 00000000000..d76f75e1fd5 --- /dev/null +++ b/configs/tinyvit/metafile.yml @@ -0,0 +1,144 @@ +Collections: + - Name: TinyViT + Metadata: + Training Data: ImageNet-1k + Architecture: + - MBConv + - Window Multi-head Self-Attention + Paper: + URL: https://arxiv.org/abs/2207.10666 + Title: 'TinyViT: Fast Pretraining Distillation for Small Vision Transformers' + README: configs/tinyvit/README.md + Code: + Version: v1.0.0rc1 + URL: https://github.com/open-mmlab/mmclassification/blob/v0.23.2/mmcls/models/backbones/tinyvit.py + +Models: + - Name: tinyvit-5m_3rdparty_8xb256_in1k + Metadata: + FLOPs: 1286655360 + Parameters: 5392764 + In Collection: TinyViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.02 + Top 5 Accuracy: 94.74 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Config: configs/tinyvit/tinyvit-5m_8xb256_in1k.py + Converted From: + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_1k.pth + Code: https://github.com/microsoft/Cream/tree/main/TinyViT + - Name: tinyvit-11m_3rdparty_8xb256_in1k + Metadata: + FLOPs: 2050033664 + Parameters: 10996972 + In Collection: TinyViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.44 + Top 5 Accuracy: 95.79 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Config: configs/tinyvit/tinyvit-11m_8xb256_in1k.py + Converted From: + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_1k.pth + Code: https://github.com/microsoft/Cream/tree/main/TinyViT + - Name: tinyvit-21m_3rdparty_8xb256_in1k + Metadata: + FLOPs: 4301124096 + Parameters: 21198568 + In Collection: TinyViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.08 + Top 5 Accuracy: 96.58 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Config: configs/tinyvit/tinyvit-21m_8xb256_in1k.py + Converted From: + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_1k.pth + Code: https://github.com/microsoft/Cream/tree/main/TinyViT + - Name: tinyvit-5m-distill_3rdparty_8xb256_in1k + Metadata: + FLOPs: 1286655360 + Parameters: 5392764 + In Collection: TinyViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 80.71 + Top 5 Accuracy: 95.57 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Config: configs/tinyvit/tinyvit-5m-distill_8xb256_in1k.py + Converted From: + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_22kto1k_distill.pth + Code: https://github.com/microsoft/Cream/tree/main/TinyViT + - Name: tinyvit-11m-distill_3rdparty_8xb256_in1k + Metadata: + FLOPs: 2050033664 + Parameters: 10996972 + In Collection: TinyViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.19 + Top 5 Accuracy: 96.53 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Config: configs/tinyvit/tinyvit-11m-distill_8xb256_in1k.py + Converted From: + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_22kto1k_distill.pth + Code: 
https://github.com/microsoft/Cream/tree/main/TinyViT + - Name: tinyvit-21m-distill_3rdparty_8xb256_in1k + Metadata: + FLOPs: 4301124096 + Parameters: 21198568 + In Collection: TinyViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.85 + Top 5 Accuracy: 97.27 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k.py + Converted From: + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_distill.pth + Code: https://github.com/microsoft/Cream/tree/main/TinyViT + - Name: tinyvit-21m-distill_3rdparty_8xb256_in1k-384px + Metadata: + FLOPs: 13848250176 + Parameters: 21230488 + In Collection: TinyViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.21 + Top 5 Accuracy: 97.77 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-384px.py + Converted From: + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_384_distill.pth + Code: https://github.com/microsoft/Cream/tree/main/TinyViT + - Name: tinyvit-21m-distill_3rdparty_8xb256_in1k-512px + Metadata: + FLOPs: 27151420224 + Parameters: 21268120 + In Collection: TinyViT + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.44 + Top 5 Accuracy: 97.89 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-512px.py + Converted From: + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_512_distill.pth + Code: https://github.com/microsoft/Cream/tree/main/TinyViT diff --git a/configs/tinyvit/tinyvit-11m-distill_8xb256_in1k.py b/configs/tinyvit/tinyvit-11m-distill_8xb256_in1k.py new file mode 100644 index 00000000000..145feb9aa65 --- /dev/null +++ b/configs/tinyvit/tinyvit-11m-distill_8xb256_in1k.py @@ -0,0 +1,3 @@ +_base_ = [ + './tinyvit-11m_8xb256_in1k.py', +] diff --git a/configs/tinyvit/tinyvit-11m_8xb256_in1k.py b/configs/tinyvit/tinyvit-11m_8xb256_in1k.py new file mode 100644 index 00000000000..f3acfa86a0d --- /dev/null +++ b/configs/tinyvit/tinyvit-11m_8xb256_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/datasets/imagenet_bs32_pil_bicubic.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', + '../_base_/models/tinyvit/tinyvit-11m.py', +] diff --git a/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-384px.py b/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-384px.py new file mode 100644 index 00000000000..6c4878122d0 --- /dev/null +++ b/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-384px.py @@ -0,0 +1,29 @@ +_base_ = [ + '../_base_/datasets/imagenet_bs32_pil_bicubic.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', + '../_base_/models/tinyvit/tinyvit-21m.py', +] + +# model settings +model = dict( + backbone=dict( + img_size=(384, 384), + window_size=[12, 12, 24, 12], + drop_path_rate=0.1, + )) + +# data settings +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='Resize', + scale=(384, 384), + backend='pillow', + interpolation='bicubic'), + dict(type='PackClsInputs'), +] + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +test_dataloader = val_dataloader diff --git a/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-512px.py 
b/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-512px.py new file mode 100644 index 00000000000..4746e320746 --- /dev/null +++ b/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-512px.py @@ -0,0 +1,28 @@ +_base_ = [ + '../_base_/datasets/imagenet_bs32_pil_bicubic.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', + '../_base_/models/tinyvit/tinyvit-21m.py', +] + +# model settings +model = dict( + backbone=dict( + img_size=(512, 512), + window_size=[16, 16, 32, 16], + drop_path_rate=0.1, + )) +# data settings +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='Resize', + scale=(512, 512), + backend='pillow', + interpolation='bicubic'), + dict(type='PackClsInputs'), +] + +val_dataloader = dict(batch_size=16, dataset=dict(pipeline=test_pipeline)) + +test_dataloader = val_dataloader diff --git a/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k.py b/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k.py new file mode 100644 index 00000000000..53885852757 --- /dev/null +++ b/configs/tinyvit/tinyvit-21m-distill_8xb256_in1k.py @@ -0,0 +1,3 @@ +_base_ = [ + './tinyvit-21m_8xb256_in1k.py', +] diff --git a/configs/tinyvit/tinyvit-21m_8xb256_in1k.py b/configs/tinyvit/tinyvit-21m_8xb256_in1k.py new file mode 100644 index 00000000000..6c12019c9cf --- /dev/null +++ b/configs/tinyvit/tinyvit-21m_8xb256_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/datasets/imagenet_bs32_pil_bicubic.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', + '../_base_/models/tinyvit/tinyvit-21m.py', +] diff --git a/configs/tinyvit/tinyvit-5m-distill_8xb256_in1k.py b/configs/tinyvit/tinyvit-5m-distill_8xb256_in1k.py new file mode 100644 index 00000000000..0003c30ac46 --- /dev/null +++ b/configs/tinyvit/tinyvit-5m-distill_8xb256_in1k.py @@ -0,0 +1,3 @@ +_base_ = [ + './tinyvit-5m_8xb256_in1k.py', +] diff --git a/configs/tinyvit/tinyvit-5m_8xb256_in1k.py b/configs/tinyvit/tinyvit-5m_8xb256_in1k.py new file mode 100644 index 00000000000..262b5a469c4 --- /dev/null +++ b/configs/tinyvit/tinyvit-5m_8xb256_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/datasets/imagenet_bs32_pil_bicubic.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', + '../_base_/models/tinyvit/tinyvit-5m.py', +] diff --git a/mmcls/models/backbones/__init__.py b/mmcls/models/backbones/__init__.py index b29e63e250a..458741fb873 100644 --- a/mmcls/models/backbones/__init__.py +++ b/mmcls/models/backbones/__init__.py @@ -41,6 +41,7 @@ from .swin_transformer_v2 import SwinTransformerV2 from .t2t_vit import T2T_ViT from .timm_backbone import TIMMBackbone +from .tinyvit import TinyViT from .tnt import TNT from .twins import PCPVT, SVT from .van import VAN @@ -101,4 +102,5 @@ 'DaViT', 'BEiT', 'RevVisionTransformer', + 'TinyViT', ] diff --git a/mmcls/models/backbones/tinyvit.py b/mmcls/models/backbones/tinyvit.py new file mode 100644 index 00000000000..47064791890 --- /dev/null +++ b/mmcls/models/backbones/tinyvit.py @@ -0,0 +1,769 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Sequence, Tuple + +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer +from mmengine.model import BaseModule, ModuleList, Sequential +from mmengine.registry import MODELS +from torch.nn import functional as F + +from ..utils import LeAttention +from .base_backbone import BaseBackbone + + +class ConvBN2d(Sequential): + """An implementation of Conv2d + BatchNorm2d with support of fusion. + + Modified from + https://github.com/microsoft/Cream/blob/main/TinyViT/models/tiny_vit.py + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + kernel_size (int): The size of the convolution kernel. + Default: 1. + stride (int): The stride of the convolution. + Default: 1. + padding (int): The padding of the convolution. + Default: 0. + dilation (int): The dilation of the convolution. + Default: 1. + groups (int): The number of groups in the convolution. + Default: 1. + bn_weight_init (float): The initial value of the weight of + the nn.BatchNorm2d layer. Default: 1.0. + init_cfg (dict): The initialization config of the module. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + bn_weight_init=1.0, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.add_module( + 'conv2d', + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=False)) + bn2d = nn.BatchNorm2d(num_features=out_channels) + # bn initialization + torch.nn.init.constant_(bn2d.weight, bn_weight_init) + torch.nn.init.constant_(bn2d.bias, 0) + self.add_module('bn2d', bn2d) + + @torch.no_grad() + def fuse(self): + conv2d, bn2d = self._modules.values() + w = bn2d.weight / (bn2d.running_var + bn2d.eps)**0.5 + w = conv2d.weight * w[:, None, None, None] + b = bn2d.bias - bn2d.running_mean * bn2d.weight / \ + (bn2d.running_var + bn2d.eps)**0.5 + + m = nn.Conv2d( + in_channels=w.size(1) * self.c.groups, + out_channels=w.size(0), + kernel_size=w.shape[2:], + stride=self.conv2d.stride, + padding=self.conv2d.padding, + dilation=self.conv2d.dilation, + groups=self.conv2d.groups) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +class PatchEmbed(BaseModule): + """Patch Embedding for Vision Transformer. + + Adapted from + https://github.com/microsoft/Cream/blob/main/TinyViT/models/tiny_vit.py + + Different from `mmcv.cnn.bricks.transformer.PatchEmbed`, this module use + Conv2d and BatchNorm2d to implement PatchEmbedding, and output shape is + (N, C, H, W). + + Args: + in_channels (int): The number of input channels. + embed_dim (int): The embedding dimension. + resolution (Tuple[int, int]): The resolution of the input feature. + act_cfg (dict): The activation config of the module. + Default: dict(type='GELU'). 
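+
+ The following is only an illustrative sketch of the expected input and
+ output shapes:
+
+ Example:
+ >>> import torch
+ >>> from mmcls.models.backbones.tinyvit import PatchEmbed
+ >>> patch_embed = PatchEmbed(
+ ...     in_channels=3, embed_dim=64, resolution=(224, 224))
+ >>> # A 224x224 image is embedded into a 4x downsampled NCHW feature map.
+ >>> patch_embed(torch.rand(1, 3, 224, 224)).shape
+ torch.Size([1, 64, 56, 56])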
+ """ + + def __init__(self, + in_channels, + embed_dim, + resolution, + act_cfg=dict(type='GELU')): + super().__init__() + img_size: Tuple[int, int] = resolution + self.patches_resolution = (img_size[0] // 4, img_size[1] // 4) + self.num_patches = self.patches_resolution[0] * \ + self.patches_resolution[1] + self.in_channels = in_channels + self.embed_dim = embed_dim + self.seq = nn.Sequential( + ConvBN2d( + in_channels, + embed_dim // 2, + kernel_size=3, + stride=2, + padding=1), + build_activation_layer(act_cfg), + ConvBN2d( + embed_dim // 2, embed_dim, kernel_size=3, stride=2, padding=1), + ) + + def forward(self, x): + return self.seq(x) + + +class PatchMerging(nn.Module): + """Patch Merging for TinyViT. + + Adapted from + https://github.com/microsoft/Cream/blob/main/TinyViT/models/tiny_vit.py + + Different from `mmcls.models.utils.PatchMerging`, this module use Conv2d + and BatchNorm2d to implement PatchMerging. + + Args: + in_channels (int): The number of input channels. + resolution (Tuple[int, int]): The resolution of the input feature. + out_channels (int): The number of output channels. + act_cfg (dict): The activation config of the module. + Default: dict(type='GELU'). + """ + + def __init__(self, + resolution, + in_channels, + out_channels, + act_cfg=dict(type='GELU')): + super().__init__() + + self.img_size = resolution + + self.act = build_activation_layer(act_cfg) + self.conv1 = ConvBN2d(in_channels, out_channels, kernel_size=1) + self.conv2 = ConvBN2d( + out_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + groups=out_channels) + self.conv3 = ConvBN2d(out_channels, out_channels, kernel_size=1) + self.out_resolution = (resolution[0] // 2, resolution[1] // 2) + + def forward(self, x): + if len(x.shape) == 3: + H, W = self.img_size + B = x.shape[0] + x = x.view(B, H, W, -1).permute(0, 3, 1, 2) + x = self.conv1(x) + x = self.act(x) + x = self.conv2(x) + x = self.act(x) + x = self.conv3(x) + + x = x.flatten(2).transpose(1, 2) + return x + + +class MBConvBlock(nn.Module): + """Mobile Inverted Residual Bottleneck Block for TinyViT. Adapted from + https://github.com/microsoft/Cream/blob/main/TinyViT/models/tiny_vit.py. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + expand_ratio (int): The expand ratio of the hidden channels. + drop_rate (float): The drop rate of the block. + act_cfg (dict): The activation config of the module. + Default: dict(type='GELU'). + """ + + def __init__(self, + in_channels, + out_channels, + expand_ratio, + drop_path, + act_cfg=dict(type='GELU')): + super().__init__() + self.in_channels = in_channels + hidden_channels = int(in_channels * expand_ratio) + + # linear + self.conv1 = ConvBN2d(in_channels, hidden_channels, kernel_size=1) + self.act = build_activation_layer(act_cfg) + # depthwise conv + self.conv2 = ConvBN2d( + in_channels=hidden_channels, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + groups=hidden_channels) + # linear + self.conv3 = ConvBN2d( + hidden_channels, out_channels, kernel_size=1, bn_weight_init=0.0) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + shortcut = x + + x = self.conv1(x) + x = self.act(x) + + x = self.conv2(x) + x = self.act(x) + + x = self.conv3(x) + + x = self.drop_path(x) + + x += shortcut + x = self.act(x) + + return x + + +class ConvStage(BaseModule): + """Convolution Stage for TinyViT. 
+ + Adapted from + https://github.com/microsoft/Cream/blob/main/TinyViT/models/tiny_vit.py + + Args: + in_channels (int): The number of input channels. + resolution (Tuple[int, int]): The resolution of the input feature. + depth (int): The number of blocks in the stage. + act_cfg (dict): The activation config of the module. + drop_path (float): The drop path of the block. + downsample (None | nn.Module): The downsample operation. + Default: None. + use_checkpoint (bool): Whether to use checkpointing to save memory. + out_channels (int): The number of output channels. + conv_expand_ratio (int): The expand ratio of the hidden channels. + Default: 4. + init_cfg (dict | list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels, + resolution, + depth, + act_cfg, + drop_path=0., + downsample=None, + use_checkpoint=False, + out_channels=None, + conv_expand_ratio=4., + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.use_checkpoint = use_checkpoint + # build blocks + self.blocks = ModuleList([ + MBConvBlock( + in_channels=in_channels, + out_channels=in_channels, + expand_ratio=conv_expand_ratio, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path) + for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + resolution=resolution, + in_channels=in_channels, + out_channels=out_channels, + act_cfg=act_cfg) + self.resolution = self.downsample.out_resolution + else: + self.downsample = None + self.resolution = resolution + + def forward(self, x): + for block in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(block, x) + else: + x = block(x) + + if self.downsample is not None: + x = self.downsample(x) + return x + + +class MLP(BaseModule): + """MLP module for TinyViT. + + Args: + in_channels (int): The number of input channels. + hidden_channels (int, optional): The number of hidden channels. + Default: None. + out_channels (int, optional): The number of output channels. + Default: None. + act_cfg (dict): The activation config of the module. + Default: dict(type='GELU'). + drop (float): Probability of an element to be zeroed. + Default: 0. + init_cfg (dict | list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels, + hidden_channels=None, + out_channels=None, + act_cfg=dict(type='GELU'), + drop=0., + init_cfg=None): + super().__init__(init_cfg=init_cfg) + out_channels = out_channels or in_channels + hidden_channels = hidden_channels or in_channels + self.norm = nn.LayerNorm(in_channels) + self.fc1 = nn.Linear(in_channels, hidden_channels) + self.fc2 = nn.Linear(hidden_channels, out_channels) + self.act = build_activation_layer(act_cfg) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.norm(x) + + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class TinyViTBlock(BaseModule): + """TinViT Block. + + Args: + in_channels (int): The number of input channels. + resolution (Tuple[int, int]): The resolution of the input feature. + num_heads (int): The number of heads in the multi-head attention. + window_size (int): The size of the window. + Default: 7. + mlp_ratio (float): The ratio of mlp hidden dim to embedding dim. + Default: 4. + drop (float): Probability of an element to be zeroed. + Default: 0. + drop_path (float): The drop path of the block. + Default: 0. + local_conv_size (int): The size of the local convolution. 
+ Default: 3. + act_cfg (dict): The activation config of the module. + Default: dict(type='GELU'). + """ + + def __init__(self, + in_channels, + resolution, + num_heads, + window_size=7, + mlp_ratio=4., + drop=0., + drop_path=0., + local_conv_size=3, + act_cfg=dict(type='GELU')): + super().__init__() + self.in_channels = in_channels + self.img_size = resolution + self.num_heads = num_heads + assert window_size > 0, 'window_size must be greater than 0' + self.window_size = window_size + self.mlp_ratio = mlp_ratio + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + assert in_channels % num_heads == 0, \ + 'dim must be divisible by num_heads' + head_dim = in_channels // num_heads + + window_resolution = (window_size, window_size) + self.attn = LeAttention( + in_channels, + head_dim, + num_heads, + attn_ratio=1, + resolution=window_resolution) + + mlp_hidden_dim = int(in_channels * mlp_ratio) + self.mlp = MLP( + in_channels=in_channels, + hidden_channels=mlp_hidden_dim, + act_cfg=act_cfg, + drop=drop) + + self.local_conv = ConvBN2d( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=local_conv_size, + stride=1, + padding=local_conv_size // 2, + groups=in_channels) + + def forward(self, x): + H, W = self.img_size + B, L, C = x.shape + assert L == H * W, 'input feature has wrong size' + res_x = x + if H == self.window_size and W == self.window_size: + x = self.attn(x) + else: + x = x.view(B, H, W, C) + pad_b = (self.window_size - + H % self.window_size) % self.window_size + pad_r = (self.window_size - + W % self.window_size) % self.window_size + padding = pad_b > 0 or pad_r > 0 + + if padding: + x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b)) + + pH, pW = H + pad_b, W + pad_r + nH = pH // self.window_size + nW = pW // self.window_size + # window partition + x = x.view(B, nH, self.window_size, nW, self.window_size, + C).transpose(2, 3).reshape( + B * nH * nW, self.window_size * self.window_size, C) + x = self.attn(x) + # window reverse + x = x.view(B, nH, nW, self.window_size, self.window_size, + C).transpose(2, 3).reshape(B, pH, pW, C) + + if padding: + x = x[:, :H, :W].contiguous() + + x = x.view(B, L, C) + + x = res_x + self.drop_path(x) + + x = x.transpose(1, 2).reshape(B, C, H, W) + x = self.local_conv(x) + x = x.view(B, C, L).transpose(1, 2) + + x = x + self.drop_path(self.mlp(x)) + return x + + +class BasicStage(BaseModule): + """Basic Stage for TinyViT. + + Args: + in_channels (int): The number of input channels. + resolution (Tuple[int, int]): The resolution of the input feature. + depth (int): The number of blocks in the stage. + num_heads (int): The number of heads in the multi-head attention. + window_size (int): The size of the window. + mlp_ratio (float): The ratio of mlp hidden dim to embedding dim. + Default: 4. + drop (float): Probability of an element to be zeroed. + Default: 0. + drop_path (float): The drop path of the block. + Default: 0. + downsample (None | nn.Module): The downsample operation. + Default: None. + use_checkpoint (bool): Whether to use checkpointing to save memory. + Default: False. + act_cfg (dict): The activation config of the module. + Default: dict(type='GELU'). + init_cfg (dict | list[dict], optional): Initialization config dict. + Default: None. 
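+
+ The following is only an illustrative sketch; the sequence length of the
+ input must equal ``resolution[0] * resolution[1]``:
+
+ Example:
+ >>> import torch
+ >>> from mmcls.models.backbones.tinyvit import BasicStage
+ >>> stage = BasicStage(in_channels=160, resolution=(14, 14), depth=2,
+ ...                    num_heads=5, window_size=7)
+ >>> tokens = torch.rand(1, 14 * 14, 160)
+ >>> # Without a downsample layer the stage keeps the (B, L, C) shape.
+ >>> stage(tokens).shape
+ torch.Size([1, 196, 160])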
+ """ + + def __init__(self, + in_channels, + resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + drop=0., + drop_path=0., + downsample=None, + use_checkpoint=False, + local_conv_size=3, + out_channels=None, + act_cfg=dict(type='GELU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.use_checkpoint = use_checkpoint + # build blocks + self.blocks = ModuleList([ + TinyViTBlock( + in_channels=in_channels, + resolution=resolution, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + drop=drop, + local_conv_size=local_conv_size, + act_cfg=act_cfg, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path) + for i in range(depth) + ]) + + # build patch merging layer + if downsample is not None: + self.downsample = downsample( + resolution=resolution, + in_channels=in_channels, + out_channels=out_channels, + act_cfg=act_cfg) + self.resolution = self.downsample.out_resolution + else: + self.downsample = None + self.resolution = resolution + + def forward(self, x): + for block in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(block, x) + else: + x = block(x) + + if self.downsample is not None: + x = self.downsample(x) + return x + + +@MODELS.register_module() +class TinyViT(BaseBackbone): + """TinyViT. + A PyTorch implementation of : `TinyViT: Fast Pretraining Distillation + for Small Vision Transformers`_ + + Inspiration from + https://github.com/microsoft/Cream/blob/main/TinyViT + + Args: + arch (str | dict): The architecture of TinyViT. + Default: '5m'. + img_size (tuple | int): The resolution of the input image. + Default: (224, 224) + window_size (list): The size of the window. + Default: [7, 7, 14, 7] + in_channels (int): The number of input channels. + Default: 3. + depths (list[int]): The depth of each stage. + Default: [2, 2, 6, 2]. + mlp_ratio (list[int]): The ratio of mlp hidden dim to embedding dim. + Default: 4. + drop_rate (float): Probability of an element to be zeroed. + Default: 0. + drop_path_rate (float): The drop path of the block. + Default: 0.1. + use_checkpoint (bool): Whether to use checkpointing to save memory. + Default: False. + mbconv_expand_ratio (int): The expand ratio of the mbconv. + Default: 4.0 + local_conv_size (int): The size of the local conv. + Default: 3. + layer_lr_decay (float): The layer lr decay. + Default: 1.0 + out_indices (int | list[int]): Output from which stages. + Default: -1 + frozen_stages (int | list[int]): Stages to be frozen (all param fixed). + Default: -0 + gap_before_final_nrom (bool): Whether to add a gap before the final + norm. Default: True. + act_cfg (dict): The activation config of the module. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (dict | list[dict], optional): Initialization config dict. + Default: None. 
+ """ + arch_settings = { + '5m': { + 'channels': [64, 128, 160, 320], + 'num_heads': [2, 4, 5, 10], + 'depths': [2, 2, 6, 2], + }, + '11m': { + 'channels': [64, 128, 256, 448], + 'num_heads': [2, 4, 8, 14], + 'depths': [2, 2, 6, 2], + }, + '21m': { + 'channels': [96, 192, 384, 576], + 'num_heads': [3, 6, 12, 18], + 'depths': [2, 2, 6, 2], + }, + } + + def __init__(self, + arch='5m', + img_size=(224, 224), + window_size=[7, 7, 14, 7], + in_channels=3, + mlp_ratio=4., + drop_rate=0., + drop_path_rate=0.1, + use_checkpoint=False, + mbconv_expand_ratio=4.0, + local_conv_size=3, + layer_lr_decay=1.0, + out_indices=-1, + frozen_stages=0, + gap_before_final_norm=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'Unavaiable arch, please choose from ' \ + f'({set(self.arch_settings)} or pass a dict.' + arch = self.arch_settings[arch] + elif isinstance(arch, dict): + assert 'channels' in arch and 'num_heads' in arch and \ + 'depths' in arch, 'The arch dict must have' \ + f'"channels", "num_heads", "window_sizes" ' \ + f'keys, but got {arch.keys()}' + + self.channels = arch['channels'] + self.num_heads = arch['num_heads'] + self.widow_sizes = window_size + self.img_size = img_size + self.depths = arch['depths'] + + self.num_stages = len(self.channels) + + if isinstance(out_indices, int): + out_indices = [out_indices] + assert isinstance(out_indices, Sequence), \ + f'"out_indices" must by a sequence or int, ' \ + f'get {type(out_indices)} instead.' + for i, index in enumerate(out_indices): + if index < 0: + out_indices[i] = 4 + index + assert out_indices[i] >= 0, f'Invalid out_indices {index}' + self.out_indices = out_indices + + self.frozen_stages = frozen_stages + self.gap_before_final_norm = gap_before_final_norm + self.layer_lr_decay = layer_lr_decay + + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dim=self.channels[0], + resolution=self.img_size, + act_cfg=dict(type='GELU')) + patches_resolution = self.patch_embed.patches_resolution + + # stochastic depth decay rule + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(self.depths)) + ] + + # build stages + self.stages = ModuleList() + for i in range(self.num_stages): + depth = self.depths[i] + channel = self.channels[i] + curr_resolution = (patches_resolution[0] // (2**i), + patches_resolution[1] // (2**i)) + drop_path = dpr[sum(self.depths[:i]):sum(self.depths[:i + 1])] + downsample = PatchMerging if (i < self.num_stages - 1) else None + out_channels = self.channels[min(i + 1, self.num_stages - 1)] + if i >= 1: + stage = BasicStage( + in_channels=channel, + resolution=curr_resolution, + depth=depth, + num_heads=self.num_heads[i], + window_size=self.widow_sizes[i], + mlp_ratio=mlp_ratio, + drop=drop_rate, + drop_path=drop_path, + downsample=downsample, + use_checkpoint=use_checkpoint, + local_conv_size=local_conv_size, + out_channels=out_channels, + act_cfg=act_cfg) + else: + stage = ConvStage( + in_channels=channel, + resolution=curr_resolution, + depth=depth, + act_cfg=act_cfg, + drop_path=drop_path, + downsample=downsample, + use_checkpoint=use_checkpoint, + out_channels=out_channels, + conv_expand_ratio=mbconv_expand_ratio) + self.stages.append(stage) + + # add output norm + if i in self.out_indices: + norm_layer = build_norm_layer(norm_cfg, out_channels)[1] + self.add_module(f'norm{i}', norm_layer) + + def set_layer_lr_decay(self, layer_lr_decay): + # TODO: add 
layer_lr_decay + pass + + def forward(self, x): + outs = [] + x = self.patch_embed(x) + + for i, stage in enumerate(self.stages): + x = stage(x) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + if self.gap_before_final_norm: + gap = x.mean(1) + outs.append(norm_layer(gap)) + else: + out = norm_layer(x) + # convert the (B,L,C) format into (B,C,H,W) format + # which would be better for the downstream tasks. + B, L, C = out.shape + out = out.view(B, *stage.resolution, C) + outs.append(out.permute(0, 3, 1, 2)) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + stage = self.stages[i] + stage.eval() + for param in stage.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(TinyViT, self).train(mode) + self._freeze_stages() diff --git a/mmcls/models/utils/__init__.py b/mmcls/models/utils/__init__.py index f094fd80534..e47cd363b7d 100644 --- a/mmcls/models/utils/__init__.py +++ b/mmcls/models/utils/__init__.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .attention import (BEiTAttention, ChannelMultiheadAttention, +from .attention import (BEiTAttention, ChannelMultiheadAttention, LeAttention, MultiheadAttention, ShiftWindowMSA, WindowMSA, WindowMSAV2) from .batch_augments import CutMix, Mixup, RandomBatchAugment, ResizeMix @@ -16,32 +16,12 @@ from .se_layer import SELayer __all__ = [ - 'channel_shuffle', - 'make_divisible', - 'InvertedResidual', - 'SELayer', - 'to_ntuple', - 'to_2tuple', - 'to_3tuple', - 'to_4tuple', - 'PatchEmbed', - 'PatchMerging', - 'HybridEmbed', - 'RandomBatchAugment', - 'ShiftWindowMSA', - 'is_tracing', - 'MultiheadAttention', - 'ConditionalPositionEncoding', - 'resize_pos_embed', - 'resize_relative_position_bias_table', - 'ClsDataPreprocessor', - 'Mixup', - 'CutMix', - 'ResizeMix', - 'BEiTAttention', - 'LayerScale', - 'WindowMSA', - 'WindowMSAV2', - 'ChannelMultiheadAttention', - 'PositionEncodingFourier', + 'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer', + 'to_ntuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', 'PatchEmbed', + 'PatchMerging', 'HybridEmbed', 'RandomBatchAugment', 'ShiftWindowMSA', + 'is_tracing', 'MultiheadAttention', 'ConditionalPositionEncoding', + 'resize_pos_embed', 'resize_relative_position_bias_table', + 'ClsDataPreprocessor', 'Mixup', 'CutMix', 'ResizeMix', 'BEiTAttention', + 'LayerScale', 'WindowMSA', 'WindowMSAV2', 'ChannelMultiheadAttention', + 'PositionEncodingFourier', 'LeAttention' ] diff --git a/mmcls/models/utils/attention.py b/mmcls/models/utils/attention.py index b2e4bf8929c..8d78b59aaa9 100644 --- a/mmcls/models/utils/attention.py +++ b/mmcls/models/utils/attention.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +import itertools + import numpy as np import torch import torch.nn as nn @@ -789,3 +791,91 @@ def forward(self, x): if self.v_shortcut: x = qkv[2].squeeze(1) + x return x + + +class LeAttention(BaseModule): + """LeViT Attention. Multi-head attention with attention bias, which is + proposed in `LeViT: a Vision Transformer in ConvNet’s Clothing for Faster + Inference`_ + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. Default: 8. + key_dim (int): Dimension of key. Default: None. + attn_ratio (int): Ratio of attention heads. Default: 8. + resolution (tuple[int]): Input resolution. Default: (16, 16). + init_cfg (dict, optional): The Config for initialization. 
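+
+ The following is only an illustrative sketch; the number of tokens must
+ equal ``resolution[0] * resolution[1]`` so that the learned attention
+ biases can be added:
+
+ Example:
+ >>> import torch
+ >>> from mmcls.models.utils import LeAttention
+ >>> attn = LeAttention(dim=192, key_dim=64, num_heads=3, attn_ratio=1,
+ ...                    resolution=(7, 7))
+ >>> tokens = torch.rand(2, 7 * 7, 192)
+ >>> attn(tokens).shape
+ torch.Size([2, 49, 192])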
+ """ + + def __init__(self, + dim, + key_dim, + num_heads=8, + attn_ratio=4, + resolution=(14, 14), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + # (h, w) + assert isinstance(resolution, tuple) and len(resolution) == 2 + self.num_heads = num_heads + self.scale = key_dim**-0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * num_heads + self.attn_ratio = attn_ratio + h = self.dh + nh_kd * 2 + + self.norm = nn.LayerNorm(dim) + self.qkv = nn.Linear(dim, h) + self.proj = nn.Linear(self.dh, dim) + + points = list( + itertools.product(range(resolution[0]), range(resolution[1]))) + N = len(points) + attention_offsets = {} + idxs = [] + for p1 in points: + for p2 in points: + offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = torch.nn.Parameter( + torch.zeros(num_heads, len(attention_offsets))) + self.register_buffer( + 'attention_bias_idxs', + torch.LongTensor(idxs).view(N, N), + persistent=False) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, 'ab'): + del self.ab + else: + self.ab = self.attention_biases[:, self.attention_bias_idxs] + + def forward(self, x): # x (B,N,C) + B, N, _ = x.shape + + # Normalization + x = self.norm(x) + + qkv = self.qkv(x) + # (B, N, num_heads, d) + q, k, v = qkv.view(B, N, self.num_heads, + -1).split([self.key_dim, self.key_dim, self.d], + dim=3) + # (B, num_heads, N, d) + q = q.permute(0, 2, 1, 3) + k = k.permute(0, 2, 1, 3) + v = v.permute(0, 2, 1, 3) + + attn = ((q @ k.transpose(-2, -1)) * self.scale + + (self.attention_biases[:, self.attention_bias_idxs] + if self.training else self.ab)) + attn = attn.softmax(dim=-1) + x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh) + x = self.proj(x) + return x diff --git a/model-index.yml b/model-index.yml index adce8e2ae91..d0bbf424def 100644 --- a/model-index.yml +++ b/model-index.yml @@ -13,6 +13,7 @@ Import: - configs/tnt/metafile.yml - configs/vision_transformer/metafile.yml - configs/t2t_vit/metafile.yml + - configs/tinyvit/metafile.yml - configs/mlp_mixer/metafile.yml - configs/conformer/metafile.yml - configs/regnet/metafile.yml diff --git a/tests/test_models/test_backbones/test_tinyvit.py b/tests/test_models/test_backbones/test_tinyvit.py new file mode 100644 index 00000000000..50b0218a293 --- /dev/null +++ b/tests/test_models/test_backbones/test_tinyvit.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmcls.models.backbones import TinyViT + + +def test_assertion(): + with pytest.raises(AssertionError): + TinyViT(arch='unknown') + + with pytest.raises(AssertionError): + # MobileViT out_indices should be valid depth. 
+ TinyViT(out_indices=-100) + + +def test_tinyvit(): + + # Test forward + model = TinyViT(arch='5m') + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 1 + assert feat[0].shape == torch.Size([1, 320]) + + # Test forward with multiple outputs + model = TinyViT(arch='5m', out_indices=(0, 1, 2, 3)) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size([1, 128]) + assert feat[1].shape == torch.Size([1, 160]) + assert feat[2].shape == torch.Size([1, 320]) + assert feat[3].shape == torch.Size([1, 320]) + + # Test with custom arch + model = TinyViT( + arch={ + 'depths': [2, 3, 4, 5], + 'channels': [64, 128, 256, 448], + 'num_heads': [4, 4, 4, 4] + }, + out_indices=(0, 1, 2, 3)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size([1, 128]) + assert feat[1].shape == torch.Size([1, 256]) + assert feat[2].shape == torch.Size([1, 448]) + assert feat[3].shape == torch.Size([1, 448]) + + # Test without gap before final norm + model = TinyViT( + arch='21m', out_indices=(0, 1, 2, 3), gap_before_final_norm=False) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 4 + + assert feat[0].shape == torch.Size([1, 192, 28, 28]) + assert feat[1].shape == torch.Size([1, 384, 14, 14]) + assert feat[2].shape == torch.Size([1, 576, 7, 7]) + assert feat[3].shape == torch.Size([1, 576, 7, 7]) + + # Test frozen_stages + model = TinyViT(arch='11m', out_indices=(0, 1, 2, 3), frozen_stages=2) + model.init_weights() + model.train() + + for i in range(2): + assert not model.stages[i].training + + for i in range(2, 4): + assert model.stages[i].training diff --git a/tools/model_converters/tinyvit_to_mmcls.py b/tools/model_converters/tinyvit_to_mmcls.py new file mode 100644 index 00000000000..e2791998eaf --- /dev/null +++ b/tools/model_converters/tinyvit_to_mmcls.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from pathlib import Path + +import torch + + +def convert_weights(weight): + """Weight Converter. + + Converts the weights from timm to mmcls + Args: + weight (dict): weight dict from timm + Returns: + Converted weight dict for mmcls + """ + result = dict() + result['meta'] = dict() + temp = dict() + mapping = { + 'c.weight': 'conv2d.weight', + 'bn.weight': 'bn2d.weight', + 'bn.bias': 'bn2d.bias', + 'bn.running_mean': 'bn2d.running_mean', + 'bn.running_var': 'bn2d.running_var', + 'bn.num_batches_tracked': 'bn2d.num_batches_tracked', + 'layers': 'stages', + 'norm_head': 'norm3', + } + + weight = weight['model'] + + for k, v in weight.items(): + # keyword mapping + for mk, mv in mapping.items(): + if mk in k: + k = k.replace(mk, mv) + + if k.startswith('head.'): + temp['head.fc.' + k[5:]] = v + else: + temp['backbone.' 
+ k] = v + + result['state_dict'] = temp + return result + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument('src', help='src detectron model path') + parser.add_argument('dst', help='save path') + args = parser.parse_args() + dst = Path(args.dst) + if dst.suffix != '.pth': + print('The path should contain the name of the pth format file.') + exit(1) + dst.parent.mkdir(parents=True, exist_ok=True) + + original_model = torch.load(args.src, map_location='cpu') + converted_model = convert_weights(original_model) + torch.save(converted_model, args.dst) From 7dcf34533d60cf725ec40e882bbd9c879104d066 Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Tue, 20 Dec 2022 16:22:31 +0800 Subject: [PATCH 12/21] [Docs] Update TinyViT links. (#1277) --- configs/tinyvit/README.md | 20 +++---- configs/tinyvit/metafile.yml | 100 +++++++++++++++++++++-------------- 2 files changed, 69 insertions(+), 51 deletions(-) diff --git a/configs/tinyvit/README.md b/configs/tinyvit/README.md index b870b73b089..4e926422409 100644 --- a/configs/tinyvit/README.md +++ b/configs/tinyvit/README.md @@ -20,15 +20,15 @@ Vision transformer (ViT) recently has drawn great attention in computer vision d ### ImageNet-1k -| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-------------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------: | :------------------------------------------------------------------------: | -| TinyViT-5M-224\* | From scratch | 5.39 | 1.29 | 79.02 | 94.74 | [config](./tinyvit-5m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth) | -| TinyViT-11M-224\* | From scratch | 11.00 | 2.05 | 81.44 | 95.79 | [config](./tinyvit-11m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth) | -| TinyViT-21M-224\* | From scratch | 21.20 | 4.30 | 83.08 | 96.54 | [config](./tinyvit-21m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | -| TinyViT-5M-224-Distilled\* | ImageNet-21k | 5.39 | 1.29 | 80.71 | 95.57 | [config](./tinyvit-5m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | -| TinyViT-11M-224-Distilled\* | ImageNet-21k | 11.00 | 2.05 | 83.19 | 96.53 | [config](./tinyvit-11m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | -| TinyViT-21M-224-Distilled\* | ImageNet-21k | 21.20 | 4.30 | 84.85 | 97.27 | [config](./tinyvit-21m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | -| TinyViT-21M-384-Distilled\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](./tinyvit-21m-distill_8xb256_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | -| TinyViT-21M-512-Distilled\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](./tinyvit-21m-distill_8xb256_in1k-512px.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :--------------------------------------------: | :--------------------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------: | :------------------------------------------------: | +| tinyvit-5m_3rdparty_in1k\* | From scratch | 5.39 | 1.29 | 79.02 | 94.74 | [config](./tinyvit-5m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_3rdparty_in1k_20221021-62cb5abf.pth) | +| tinyvit-5m_in21k-distill-pre_3rdparty_in1k\* | ImageNet-21k (distill) | 5.39 | 1.29 | 80.71 | 95.57 | [config](./tinyvit-5m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_in21k-distill-pre_3rdparty_in1k_20221021-d4b010a8.pth) | +| tinyvit-11m_3rdparty_in1k\* | From scratch | 11.00 | 2.05 | 81.44 | 95.79 | [config](./tinyvit-11m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-11m_3rdparty_in1k_20221021-11ccef16.pth) | +| tinyvit-11m_in21k-distill-pre_3rdparty_in1k\* | ImageNet-21k (distill) | 11.00 | 2.05 | 83.19 | 96.53 | [config](./tinyvit-11m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-11m_in21k-distill-pre_3rdparty_in1k_20221021-5d3bc0dc.pth) | +| tinyvit-21m_3rdparty_in1k\* | From scratch | 21.20 | 4.30 | 83.08 | 96.58 | [config](./tinyvit-21m_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_3rdparty_in1k_20221021-5346ba34.pth) | +| tinyvit-21m_in21k-distill-pre_3rdparty_in1k\* | ImageNet-21k (distill) | 21.20 | 4.30 | 84.85 | 97.27 | [config](./tinyvit-21m-distill_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k_20221021-3d9b30a2.pth) | +| tinyvit-21m_in21k-distill-pre_3rdparty_in1k-384px\* | ImageNet-21k (distill) | 21.23 | 13.85 | 86.21 | 97.77 | [config](./tinyvit-21m-distill_8xb256_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k-384px_20221021-65be6b3f.pth) | +| tinyvit-21m_in21k-distill-pre_3rdparty_in1k-512px\* | ImageNet-21k (distill) | 21.27 | 27.15 | 86.44 | 97.89 | [config](./tinyvit-21m-distill_8xb256_in1k-512px.py) | [model](https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k-512px_20221021-e42a9bea.pth) | *Models with * are converted from the [official repo](https://github.com/microsoft/Cream/tree/main/TinyViT). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/tinyvit/metafile.yml b/configs/tinyvit/metafile.yml index d76f75e1fd5..402a7fc487b 100644 --- a/configs/tinyvit/metafile.yml +++ b/configs/tinyvit/metafile.yml @@ -6,18 +6,19 @@ Collections: - MBConv - Window Multi-head Self-Attention Paper: - URL: https://arxiv.org/abs/2207.10666 Title: 'TinyViT: Fast Pretraining Distillation for Small Vision Transformers' + URL: https://arxiv.org/abs/2207.10666 README: configs/tinyvit/README.md Code: Version: v1.0.0rc1 URL: https://github.com/open-mmlab/mmclassification/blob/v0.23.2/mmcls/models/backbones/tinyvit.py Models: - - Name: tinyvit-5m_3rdparty_8xb256_in1k + - Name: tinyvit-5m_3rdparty_in1k Metadata: FLOPs: 1286655360 Parameters: 5392764 + Training Data: ImageNet-1k In Collection: TinyViT Results: - Dataset: ImageNet-1k @@ -25,79 +26,90 @@ Models: Top 1 Accuracy: 79.02 Top 5 Accuracy: 94.74 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_3rdparty_in1k_20221021-62cb5abf.pth Config: configs/tinyvit/tinyvit-5m_8xb256_in1k.py Converted From: Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_1k.pth Code: https://github.com/microsoft/Cream/tree/main/TinyViT - - Name: tinyvit-11m_3rdparty_8xb256_in1k + - Name: tinyvit-5m_in21k-distill-pre_3rdparty_in1k Metadata: - FLOPs: 2050033664 - Parameters: 10996972 + FLOPs: 1286655360 + Parameters: 5392764 + Training Data: + - ImageNet-21k + - ImageNet-1k In Collection: TinyViT Results: - Dataset: ImageNet-1k Metrics: - Top 1 Accuracy: 81.44 - Top 5 Accuracy: 95.79 + Top 1 Accuracy: 80.71 + Top 5 Accuracy: 95.57 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ - Config: configs/tinyvit/tinyvit-11m_8xb256_in1k.py + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-5m_in21k-distill-pre_3rdparty_in1k_20221021-d4b010a8.pth + Config: configs/tinyvit/tinyvit-5m-distill_8xb256_in1k.py Converted From: - Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_1k.pth + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_22kto1k_distill.pth Code: https://github.com/microsoft/Cream/tree/main/TinyViT - - Name: tinyvit-21m_3rdparty_8xb256_in1k + - Name: tinyvit-11m_3rdparty_in1k Metadata: - FLOPs: 4301124096 - Parameters: 21198568 + FLOPs: 2050033664 + Parameters: 10996972 + Training Data: ImageNet-1k In Collection: TinyViT Results: - Dataset: ImageNet-1k Metrics: - Top 1 Accuracy: 83.08 - Top 5 Accuracy: 96.58 + Top 1 Accuracy: 81.44 + Top 5 Accuracy: 95.79 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ - Config: configs/tinyvit/tinyvit-21m_8xb256_in1k.py + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-11m_3rdparty_in1k_20221021-11ccef16.pth + Config: configs/tinyvit/tinyvit-11m_8xb256_in1k.py Converted From: - Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_1k.pth + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_1k.pth Code: https://github.com/microsoft/Cream/tree/main/TinyViT - - Name: tinyvit-5m-distill_3rdparty_8xb256_in1k + - Name: tinyvit-11m_in21k-distill-pre_3rdparty_in1k Metadata: - FLOPs: 
1286655360 - Parameters: 5392764 + FLOPs: 2050033664 + Parameters: 10996972 + Training Data: + - ImageNet-21k + - ImageNet-1k In Collection: TinyViT Results: - Dataset: ImageNet-1k Metrics: - Top 1 Accuracy: 80.71 - Top 5 Accuracy: 95.57 + Top 1 Accuracy: 83.19 + Top 5 Accuracy: 96.53 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ - Config: configs/tinyvit/tinyvit-5m-distill_8xb256_in1k.py + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-11m_in21k-distill-pre_3rdparty_in1k_20221021-5d3bc0dc.pth + Config: configs/tinyvit/tinyvit-11m-distill_8xb256_in1k.py Converted From: - Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_5m_22kto1k_distill.pth + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_22kto1k_distill.pth Code: https://github.com/microsoft/Cream/tree/main/TinyViT - - Name: tinyvit-11m-distill_3rdparty_8xb256_in1k + - Name: tinyvit-21m_3rdparty_in1k Metadata: - FLOPs: 2050033664 - Parameters: 10996972 + FLOPs: 4301124096 + Parameters: 21198568 + Training Data: ImageNet-1k In Collection: TinyViT Results: - Dataset: ImageNet-1k Metrics: - Top 1 Accuracy: 83.19 - Top 5 Accuracy: 96.53 + Top 1 Accuracy: 83.08 + Top 5 Accuracy: 96.58 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ - Config: configs/tinyvit/tinyvit-11m-distill_8xb256_in1k.py + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_3rdparty_in1k_20221021-5346ba34.pth + Config: configs/tinyvit/tinyvit-21m_8xb256_in1k.py Converted From: - Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_11m_22kto1k_distill.pth + Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_1k.pth Code: https://github.com/microsoft/Cream/tree/main/TinyViT - - Name: tinyvit-21m-distill_3rdparty_8xb256_in1k + - Name: tinyvit-21m_in21k-distill-pre_3rdparty_in1k Metadata: FLOPs: 4301124096 Parameters: 21198568 + Training Data: + - ImageNet-21k + - ImageNet-1k In Collection: TinyViT Results: - Dataset: ImageNet-1k @@ -105,15 +117,18 @@ Models: Top 1 Accuracy: 84.85 Top 5 Accuracy: 97.27 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k_20221021-3d9b30a2.pth Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k.py Converted From: Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_distill.pth Code: https://github.com/microsoft/Cream/tree/main/TinyViT - - Name: tinyvit-21m-distill_3rdparty_8xb256_in1k-384px + - Name: tinyvit-21m_in21k-distill-pre_3rdparty_in1k-384px Metadata: FLOPs: 13848250176 Parameters: 21230488 + Training Data: + - ImageNet-21k + - ImageNet-1k In Collection: TinyViT Results: - Dataset: ImageNet-1k @@ -121,15 +136,18 @@ Models: Top 1 Accuracy: 86.21 Top 5 Accuracy: 97.77 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k-384px_20221021-65be6b3f.pth Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-384px.py Converted From: Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_384_distill.pth Code: 
https://github.com/microsoft/Cream/tree/main/TinyViT - - Name: tinyvit-21m-distill_3rdparty_8xb256_in1k-512px + - Name: tinyvit-21m_in21k-distill-pre_3rdparty_in1k-512px Metadata: FLOPs: 27151420224 Parameters: 21268120 + Training Data: + - ImageNet-21k + - ImageNet-1k In Collection: TinyViT Results: - Dataset: ImageNet-1k @@ -137,7 +155,7 @@ Models: Top 1 Accuracy: 86.44 Top 5 Accuracy: 97.89 Task: Image Classification - Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/ + Weights: https://download.openmmlab.com/mmclassification/v0/tinyvit/tinyvit-21m_in21k-distill-pre_3rdparty_in1k-512px_20221021-e42a9bea.pth Config: configs/tinyvit/tinyvit-21m-distill_8xb256_in1k-512px.py Converted From: Weights: https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/tiny_vit_21m_22kto1k_512_distill.pth From 14dcb69092b4847d968cf8ec1423ff6c004f06b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wangbo=20Zhao=28=E9=BB=91=E8=89=B2=E6=9E=B7=E9=94=81=29?= <56866854+wangbo-zhao@users.noreply.github.com> Date: Tue, 20 Dec 2022 16:52:54 +0800 Subject: [PATCH 13/21] [Feature] Add mixmim backbone with checkpoints. (#1224) * add mixmim backbone * add mixmim inference * add docstring, metafile, test and modify readme * Update README and metafile Co-authored-by: mzr1996 --- README.md | 1 + README_zh-CN.md | 1 + configs/_base_/models/mixmim/mixmim_base.py | 20 + configs/mixmim/README.md | 90 ++++ configs/mixmim/metafile.yml | 39 ++ configs/mixmim/mixmim-base_8xb64_in1k.py | 5 + mmcls/models/backbones/__init__.py | 2 + mmcls/models/backbones/mixmim.py | 494 ++++++++++++++++++ model-index.yml | 1 + .../test_models/test_backbones/test_mixmim.py | 40 ++ tools/model_converters/mixmimx_to_mmcls.py | 98 ++++ 11 files changed, 791 insertions(+) create mode 100644 configs/_base_/models/mixmim/mixmim_base.py create mode 100644 configs/mixmim/README.md create mode 100644 configs/mixmim/metafile.yml create mode 100644 configs/mixmim/mixmim-base_8xb64_in1k.py create mode 100644 mmcls/models/backbones/mixmim.py create mode 100644 tests/test_models/test_backbones/test_mixmim.py create mode 100644 tools/model_converters/mixmimx_to_mmcls.py diff --git a/README.md b/README.md index 3bdb53c7de5..95f3c9f5106 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea - [x] [RepLKNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/replknet) - [x] [BEiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beit) / [BEiT v2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beitv2) - [x] [EVA](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/eva) +- [x] [MixMIM](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mixmim) diff --git a/README_zh-CN.md b/README_zh-CN.md index 0ebde4aefa0..5fddca0ee54 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -155,6 +155,7 @@ mim install -e . 
- [x] [RepLKNet](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/replknet) - [x] [BEiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beit) / [BEiT v2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beitv2) - [x] [EVA](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/eva) +- [x] [MixMIM](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mixmim) diff --git a/configs/_base_/models/mixmim/mixmim_base.py b/configs/_base_/models/mixmim/mixmim_base.py new file mode 100644 index 00000000000..ccde357570d --- /dev/null +++ b/configs/_base_/models/mixmim/mixmim_base.py @@ -0,0 +1,20 @@ +model = dict( + type='ImageClassifier', + backbone=dict( + type='MixMIMTransformer', arch='B', drop_rate=0.0, drop_path_rate=0.1), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1024, + init_cfg=None, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + cal_acc=False), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.8), + dict(type='CutMix', alpha=1.0) + ])) diff --git a/configs/mixmim/README.md b/configs/mixmim/README.md new file mode 100644 index 00000000000..bcba223d8f6 --- /dev/null +++ b/configs/mixmim/README.md @@ -0,0 +1,90 @@ +# MixMIM + +> [MixMIM: Mixed and Masked Image Modeling for Efficient Visual Representation Learning](https://arxiv.org/abs/2205.13137) + + + +## Abstract + +In this study, we propose Mixed and Masked Image Modeling (MixMIM), a +simple but efficient MIM method that is applicable to various hierarchical Vision +Transformers. Existing MIM methods replace a random subset of input tokens with +a special [MASK] symbol and aim at reconstructing original image tokens from +the corrupted image. However, we find that using the [MASK] symbol greatly +slows down the training and causes training-finetuning inconsistency, due to the +large masking ratio (e.g., 40% in BEiT). In contrast, we replace the masked tokens +of one image with visible tokens of another image, i.e., creating a mixed image. +We then conduct dual reconstruction to reconstruct the original two images from +the mixed input, which significantly improves efficiency. While MixMIM can +be applied to various architectures, this paper explores a simpler but stronger +hierarchical Transformer, and scales with MixMIM-B, -L, and -H. Empirical +results demonstrate that MixMIM can learn high-quality visual representations +efficiently. Notably, MixMIM-B with 88M parameters achieves 85.1% top-1 +accuracy on ImageNet-1K by pretraining for 600 epochs, setting a new record for +neural networks with comparable model sizes (e.g., ViT-B) among MIM methods. +Besides, its transferring performances on the other 6 datasets show MixMIM has +better FLOPs / performance tradeoff than previous MIM methods + +
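+The token-mixing idea above can be illustrated with a minimal, self-contained sketch. This is not the
+`MixMIMTransformer` code added by this patch (the self-supervised pre-training pipeline itself lives in
+MMSelfSup, as linked below); the `mix_tokens` helper and the 0.5 mixing ratio are illustrative
+assumptions. Two images' patch tokens are combined under one random binary mask, and dual reconstruction
+recovers image A from the unmasked positions and image B from the masked ones of the same mixed sequence.
+
+```python
+import torch
+
+
+def mix_tokens(tokens_a, tokens_b, mask_ratio=0.5):
+    """Mix patch tokens of two images with a shared random binary mask.
+
+    tokens_a, tokens_b: (B, L, C) patch embeddings of two different images.
+    Returns the mixed tokens and the mask (1 = token taken from image B).
+    """
+    B, L, _ = tokens_a.shape
+    num_masked = int(L * mask_ratio)
+    ids = torch.rand(B, L).argsort(dim=1)       # random permutation per sample
+    mask = torch.zeros(B, L)
+    mask.scatter_(1, ids[:, :num_masked], 1.0)  # first `num_masked` slots come from image B
+    mask = mask.unsqueeze(-1)                   # (B, L, 1), broadcast over channels
+    mixed = tokens_a * (1 - mask) + tokens_b * mask
+    return mixed, mask
+
+
+# Toy usage: a decoder would reconstruct image A where mask == 0 and
+# image B where mask == 1, both from the same mixed token sequence.
+a, b = torch.randn(2, 196, 128), torch.randn(2, 196, 128)
+mixed, mask = mix_tokens(a, b)
+print(mixed.shape, mask.mean().item())  # torch.Size([2, 196, 128]) 0.5
+```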
+ +
+ +## How to use it? + +### Inference + + + +**Predict image** + +```python +>>> import torch +>>> import mmcls +>>> model = mmcls.get_model('mixmim-base_3rdparty_in1k', pretrained=True) +>>> predict = mmcls.inference_model(model, 'demo/demo.JPEG') +>>> print(predict['pred_class']) +sea snake +>>> print(predict['pred_score']) +0.865431010723114 +``` + +**Use the model** + +```python +>>> import torch +>>> import mmcls +>>> +>>> model = mmcls.get_model('mixmim-base_3rdparty_in1k', pretrained=True) +>>> inputs = torch.rand(1, 3, 224, 224) +>>> # To get classification scores. +>>> out = model(inputs) +>>> print(out.shape) +torch.Size([1, 1000]) +>>> # To extract features. +>>> outs = model.extract_feat(inputs) +>>> print(outs[0].shape) +torch.Size([1, 1024]) +``` + + + +## Models + +| Model | Params(M) | Pretrain Epochs | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :-------------------------: | :-------: | :-------------: | :------: | :-------: | :-------: | :-----------------------------------: | :------------------------------------------------------------------------------------: | +| mixmim-base_3rdparty_in1k\* | 88 | 300 | 16.3 | 84.6 | 97.0 | [config](./mixmim-base_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mixmim/mixmim-base_3rdparty_in1k_20221206-e40e2c8c.pth) | + +*Models with * are converted from the [official repo](https://github.com/Sense-X/MixMIM). The config files of these models are only for inference.* + +For MixMIM self-supervised learning algorithm, welcome to [MMSelfSup page](https://github.com/open-mmlab/mmselfsup/tree/dev-1.x/configs/selfsup/mixmim) to get more information. + +## Citation + +```bibtex +@article{MixMIM2022, + author = {Jihao Liu, Xin Huang, Yu Liu, Hongsheng Li}, + journal = {arXiv:2205.13137}, + title = {MixMIM: Mixed and Masked Image Modeling for Efficient Visual Representation Learning}, + year = {2022}, +} +``` diff --git a/configs/mixmim/metafile.yml b/configs/mixmim/metafile.yml new file mode 100644 index 00000000000..70623c8c909 --- /dev/null +++ b/configs/mixmim/metafile.yml @@ -0,0 +1,39 @@ +Collections: + - Name: MixMIM + Metadata: + Architecture: + - Attention Dropout + - Convolution + - Dense Connections + - Dropout + - GELU + - Layer Normalization + - Multi-Head Attention + - Scaled Dot-Product Attention + - Tanh Activation + Paper: + Title: 'MixMIM: Mixed and Masked Image Modeling for Efficient Visual Representation Learning' + URL: https://arxiv.org/abs/2205.13137 + README: configs/mixmim/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/dev-1.x/mmcls/models/backbones/mixmim.py + Version: v1.0.0rc4 + +Models: + - Name: mixmim-base_3rdparty_in1k + Metadata: + FLOPs: 16352000000 + Parameters: 88344000 + Training Data: + - ImageNet-1k + In Collection: MixMIM + Results: + - Dataset: ImageNet-1k + Task: Image Classification + Metrics: + Top 1 Accuracy: 84.6 + Top 5 Accuracy: 97.0 + Weights: https://download.openmmlab.com/mmclassification/v0/mixmim/mixmim-base_3rdparty_in1k_20221206-e40e2c8c.pth + Config: configs/mixmim/mixmim-base_8xb64_in1k.py + Converted From: + Code: https://github.com/Sense-X/MixMIM diff --git a/configs/mixmim/mixmim-base_8xb64_in1k.py b/configs/mixmim/mixmim-base_8xb64_in1k.py new file mode 100644 index 00000000000..bb35a037656 --- /dev/null +++ b/configs/mixmim/mixmim-base_8xb64_in1k.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/mixmim/mixmim_base.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs256.py', 
'../_base_/default_runtime.py' +] diff --git a/mmcls/models/backbones/__init__.py b/mmcls/models/backbones/__init__.py index 458741fb873..b583d988dfe 100644 --- a/mmcls/models/backbones/__init__.py +++ b/mmcls/models/backbones/__init__.py @@ -16,6 +16,7 @@ from .hrnet import HRNet from .inception_v3 import InceptionV3 from .lenet import LeNet5 +from .mixmim import MixMIMTransformer from .mlp_mixer import MlpMixer from .mobilenet_v2 import MobileNetV2 from .mobilenet_v3 import MobileNetV3 @@ -102,5 +103,6 @@ 'DaViT', 'BEiT', 'RevVisionTransformer', + 'MixMIMTransformer', 'TinyViT', ] diff --git a/mmcls/models/backbones/mixmim.py b/mmcls/models/backbones/mixmim.py new file mode 100644 index 00000000000..6bed2cf4a6a --- /dev/null +++ b/mmcls/models/backbones/mixmim.py @@ -0,0 +1,494 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import torch +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.drop import DropPath +from mmcv.cnn.bricks.transformer import PatchEmbed, PatchMerging +from mmengine.model import BaseModule +from torch import nn +from torch.utils.checkpoint import checkpoint + +from mmcls.models.backbones.base_backbone import BaseBackbone +from mmcls.models.backbones.vision_transformer import TransformerEncoderLayer +from mmcls.models.utils.attention import WindowMSA +from mmcls.models.utils.helpers import to_2tuple +from mmcls.registry import MODELS + + +class MixMIMWindowAttention(WindowMSA): + """MixMIM Window Attention. + + Compared with WindowMSA, we add some modifications + in ``forward`` to meet the requirement of MixMIM during + pretraining. + + Implements one windown attention in MixMIM. + Args: + embed_dims (int): The feature dimension. + window_size (list): The height and width of the window. + num_heads (int): The number of head in attention. + qkv_bias (bool): Whether to add bias for qkv in attention modules. + Defaults to True. + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + attn_drop_rate (float): attention drop rate. + Defaults to 0. + proj_drop_rate (float): Probability of an element to be zeroed. + Defaults to 0. + init_cfg (dict, optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + embed_dims, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + init_cfg=None): + + super().__init__( + embed_dims=embed_dims, + window_size=window_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop_rate, + proj_drop=proj_drop_rate, + init_cfg=init_cfg) + + def forward(self, x, mask=None): + + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + mask = mask.reshape(B_, 1, 1, N) + mask_new = mask * mask.transpose( + 2, 3) + (1 - mask) * (1 - mask).transpose(2, 3) + mask_new = 1 - mask_new + + if mask_new.dtype == torch.float16: + attn = attn - 65500 * mask_new + else: + attn = attn - 1e30 * mask_new + + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MixMIMBlock(TransformerEncoderLayer): + """MixMIM Block. Implements one block in MixMIM. + + Args: + embed_dims (int): The feature dimension. + input_resolution (tuple): Input resolution of this layer. + num_heads (int): The number of head in attention, + window_size (list): The height and width of the window. + mlp_ratio (int): The MLP ration in FFN. + num_fcs (int): The number of linear layers in a block. + qkv_bias (bool): Whether to add bias for qkv in attention modules. + Defaults to True. + proj_drop_rate (float): Probability of an element to be zeroed. + Defaults to 0. + attn_drop_rate (float): attention drop rate. + Defaults to 0. + drop_path_rate (float): stochastic depth rate. + Defaults to 0. + norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='LN')``. + init_cfg (dict, optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + embed_dims, + input_resolution, + num_heads, + window_size=7, + mlp_ratio=4., + num_fcs=2, + qkv_bias=True, + proj_drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + init_cfg: Optional[Union[List[dict], dict]] = None) -> None: + + super().__init__( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=int(mlp_ratio * embed_dims), + drop_rate=proj_drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, + num_fcs=num_fcs, + qkv_bias=qkv_bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + init_cfg=init_cfg) + + self.embed_dims = embed_dims + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + + if min(self.input_resolution) <= self.window_size: + self.window_size = min(self.input_resolution) + + self.attn = MixMIMWindowAttention( + embed_dims=embed_dims, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate) + + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0. else nn.Identity() + + @staticmethod + def window_reverse(windows, H, W, window_size): + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + @staticmethod + def window_partition(x, window_size): + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, + window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() + windows = windows.view(-1, window_size, window_size, C) + return windows + + def forward(self, x, attn_mask=None): + H, W = self.input_resolution + B, L, C = x.shape + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # partition windows + x_windows = self.window_partition( + x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, + C) # nW*B, window_size*window_size, C + if attn_mask is not None: + attn_mask = attn_mask.repeat(B, 1, 1) # B, N, 1 + attn_mask = attn_mask.view(B, H, W, 1) + attn_mask = self.window_partition(attn_mask, self.window_size) + attn_mask = attn_mask.view(-1, self.window_size * self.window_size, + 1) + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + x = self.window_reverse(attn_windows, H, W, + self.window_size) # B H' W' C + + x = x.view(B, H * W, C) + + x = shortcut + self.drop_path(x) + + x = self.ffn(self.norm2(x), identity=x) # ffn contains DropPath + + return x + + +class MixMIMLayer(BaseModule): + """Implements one MixMIM layer, which may contains several MixMIM blocks. + + Args: + embed_dims (int): The feature dimension. + input_resolution (tuple): Input resolution of this layer. + depth (int): The number of blocks in this layer. + num_heads (int): The number of head in attention, + window_size (list): The height and width of the window. + mlp_ratio (int): The MLP ration in FFN. + qkv_bias (bool): Whether to add bias for qkv in attention modules. + Defaults to True. + proj_drop_rate (float): Probability of an element to be zeroed. + Defaults to 0. + attn_drop_rate (float): attention drop rate. + Defaults to 0. 
+ drop_path_rate (float): stochastic depth rate. + Defaults to 0. + norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='LN')``. + downsample (class, optional): Downsample the output of blocks b + y patch merging.Defaults to None. + use_checkpoint (bool): Whether use the checkpoint to + reduce GPU memory cost. + init_cfg (dict, optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + embed_dims: int, + input_resolution: int, + depth: int, + num_heads: int, + window_size: int, + mlp_ratio=4., + qkv_bias=True, + proj_drop_rate=0., + attn_drop_rate=0., + drop_path_rate=[0.], + norm_cfg=dict(type='LN'), + downsample=None, + use_checkpoint=False, + init_cfg: Optional[Union[List[dict], dict]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList() + for i in range(depth): + self.blocks.append( + MixMIMBlock( + embed_dims=embed_dims, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_drop_rate=proj_drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate[i], + norm_cfg=norm_cfg)) + # patch merging layer + if downsample is not None: + self.downsample = downsample( + in_channels=embed_dims, + out_channels=2 * embed_dims, + norm_cfg=norm_cfg) + else: + self.downsample = None + + def forward(self, x, attn_mask=None): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask=attn_mask) + if self.downsample is not None: + x, _ = self.downsample(x, self.input_resolution) + return x + + def extra_repr(self) -> str: + return f'dim={self.embed_dims}, \ + input_resolution={self.input_resolution}, depth={self.depth}' + + +@MODELS.register_module() +class MixMIMTransformer(BaseBackbone): + """MixMIM backbone. + + A PyTorch implement of : ` MixMIM: Mixed and Masked Image + Modeling for Efficient Visual Representation Learning + `_ + + Args: + arch (str | dict): MixMIM architecture. If use string, + choose from 'base','large' and 'huge'. + If use dict, it should have below keys: + + - **embed_dims** (int): The dimensions of embedding. + - **depths** (int): The number of transformer encoder layers. + - **num_heads** (int): The number of heads in attention modules. + + Defaults to 'base'. + mlp_ratio (int): The mlp ratio in FFN. Defaults to 4. + img_size (int | tuple): The expected input image shape. Because we + support dynamic input shape, just set the argument to mlp_ratio + the most common input image shape. Defaults to 224. + patch_size (int | tuple): The patch size in patch embedding. + Defaults to 16. + in_channels (int): The num of input channels. Defaults to 3. + window_size (list): The height and width of the window. + qkv_bias (bool): Whether to add bias for qkv in attention modules. + Defaults to True. + patch_cfg (dict): Extra config dict for patch embedding. + Defaults to an empty dict. + norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='LN')``. + drop_rate (float): Probability of an element to be zeroed. + Defaults to 0. + drop_path_rate (float): stochastic depth rate. Defaults to 0. + attn_drop_rate (float): attention drop rate. Defaults to 0. + use_checkpoint (bool): Whether use the checkpoint to + reduce GPU memory cost. 
+ init_cfg (dict, optional): Initialization config dict. + Defaults to None. + """ + arch_zoo = { + **dict.fromkeys( + ['b', 'base'], { + 'embed_dims': 128, + 'depths': [2, 2, 18, 2], + 'num_heads': [4, 8, 16, 32] + }), + **dict.fromkeys( + ['l', 'large'], { + 'embed_dims': 192, + 'depths': [2, 2, 18, 2], + 'num_heads': [6, 12, 24, 48] + }), + **dict.fromkeys( + ['h', 'huge'], { + 'embed_dims': 352, + 'depths': [2, 2, 18, 2], + 'num_heads': [11, 22, 44, 88] + }), + } + + def __init__( + self, + arch='base', + mlp_ratio=4, + img_size=224, + patch_size=4, + in_channels=3, + window_size=[14, 14, 14, 7], + qkv_bias=True, + patch_cfg=dict(), + norm_cfg=dict(type='LN'), + drop_rate=0.0, + drop_path_rate=0.0, + attn_drop_rate=0.0, + use_checkpoint=False, + init_cfg: Optional[dict] = None, + ) -> None: + super(MixMIMTransformer, self).__init__(init_cfg=init_cfg) + + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + essential_keys = { + 'embed_dims', 'num_layers', 'num_heads', 'feedforward_channels' + } + assert isinstance(arch, dict) and essential_keys <= set(arch), \ + f'Custom arch needs a dict with keys {essential_keys}' + self.arch_settings = arch + + self.embed_dims = self.arch_settings['embed_dims'] + self.depths = self.arch_settings['depths'] + self.num_heads = self.arch_settings['num_heads'] + + self.encoder_stride = 32 + + self.num_layers = len(self.depths) + self.qkv_bias = qkv_bias + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.use_checkpoint = use_checkpoint + self.mlp_ratio = mlp_ratio + self.window_size = window_size + + _patch_cfg = dict( + in_channels=in_channels, + input_size=img_size, + embed_dims=self.embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=patch_size, + norm_cfg=dict(type='LN'), + ) + _patch_cfg.update(patch_cfg) + self.patch_embed = PatchEmbed(**_patch_cfg) + self.patch_resolution = self.patch_embed.init_out_size + + self.dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(self.depths)) + ] + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + self.layers.append( + MixMIMLayer( + embed_dims=int(self.embed_dims * 2**i_layer), + input_resolution=(self.patch_resolution[0] // (2**i_layer), + self.patch_resolution[1] // + (2**i_layer)), + depth=self.depths[i_layer], + num_heads=self.num_heads[i_layer], + window_size=self.window_size[i_layer], + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + proj_drop_rate=self.drop_rate, + attn_drop_rate=self.attn_drop_rate, + drop_path_rate=self.dpr[sum(self.depths[:i_layer] + ):sum(self.depths[:i_layer + + 1])], + norm_cfg=norm_cfg, + downsample=PatchMerging if + (i_layer < self.num_layers - 1) else None, + use_checkpoint=self.use_checkpoint)) + + self.num_features = int(self.embed_dims * 2**(self.num_layers - 1)) + self.drop_after_pos = nn.Dropout(p=self.drop_rate) + + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.num_patches = self.patch_resolution[0] * self.patch_resolution[1] + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, self.num_patches, self.embed_dims), + requires_grad=False) + + _, self.norm = build_norm_layer(norm_cfg, self.num_features) + + def forward(self, x: torch.Tensor): + x, _ = self.patch_embed(x) + + x = x + self.absolute_pos_embed + x = self.drop_after_pos(x) + + for layer in self.layers: + x = layer(x, attn_mask=None) + + x = self.norm(x) + x = 
self.avgpool(x.transpose(1, 2)) # B C 1 + x = torch.flatten(x, 1) + + return (x, ) diff --git a/model-index.yml b/model-index.yml index d0bbf424def..a761ab8a225 100644 --- a/model-index.yml +++ b/model-index.yml @@ -45,3 +45,4 @@ Import: - configs/beitv2/metafile.yml - configs/eva/metafile.yml - configs/revvit/metafile.yml + - configs/mixmim/metafile.yml diff --git a/tests/test_models/test_backbones/test_mixmim.py b/tests/test_models/test_backbones/test_mixmim.py new file mode 100644 index 00000000000..e21d143c9b5 --- /dev/null +++ b/tests/test_models/test_backbones/test_mixmim.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from unittest import TestCase + +import torch + +from mmcls.models.backbones import MixMIMTransformer + + +class TestMixMIM(TestCase): + + def setUp(self): + self.cfg = dict(arch='b', drop_rate=0.0, drop_path_rate=0.1) + + def test_structure(self): + + # Test custom arch + cfg = deepcopy(self.cfg) + + model = MixMIMTransformer(**cfg) + self.assertEqual(model.embed_dims, 128) + self.assertEqual(sum(model.depths), 24) + self.assertIsNotNone(model.absolute_pos_embed) + + num_heads = [4, 8, 16, 32] + for i, layer in enumerate(model.layers): + self.assertEqual(layer.blocks[0].num_heads, num_heads[i]) + self.assertEqual(layer.blocks[0].ffn.feedforward_channels, + 128 * (2**i) * 4) + + def test_forward(self): + imgs = torch.randn(1, 3, 224, 224) + + cfg = deepcopy(self.cfg) + model = MixMIMTransformer(**cfg) + outs = model(imgs) + self.assertIsInstance(outs, tuple) + self.assertEqual(len(outs), 1) + averaged_token = outs[-1] + self.assertEqual(averaged_token.shape, (1, 1024)) diff --git a/tools/model_converters/mixmimx_to_mmcls.py b/tools/model_converters/mixmimx_to_mmcls.py new file mode 100644 index 00000000000..dcf9858b59b --- /dev/null +++ b/tools/model_converters/mixmimx_to_mmcls.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from collections import OrderedDict + +import mmengine +import torch +from mmengine.runner import CheckpointLoader + + +def correct_unfold_reduction_order(x: torch.Tensor): + out_channel, in_channel = x.shape + x = x.reshape(out_channel, 4, in_channel // 4) + x = x[:, [0, 2, 1, 3], :].transpose(1, 2).reshape(out_channel, in_channel) + return x + + +def correct_unfold_norm_order(x): + in_channel = x.shape[0] + x = x.reshape(4, in_channel // 4) + x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel) + return x + + +def convert_mixmim(ckpt): + + new_ckpt = OrderedDict() + + for k, v in list(ckpt.items()): + new_v = v + + if k.startswith('patch_embed'): + new_k = k.replace('proj', 'projection') + + elif k.startswith('layers'): + if 'norm1' in k: + new_k = k.replace('norm1', 'ln1') + elif 'norm2' in k: + new_k = k.replace('norm2', 'ln2') + elif 'mlp.fc1' in k: + new_k = k.replace('mlp.fc1', 'ffn.layers.0.0') + elif 'mlp.fc2' in k: + new_k = k.replace('mlp.fc2', 'ffn.layers.1') + else: + new_k = k + + elif k.startswith('norm') or k.startswith('absolute_pos_embed'): + new_k = k + + elif k.startswith('head'): + new_k = k.replace('head.', 'head.fc.') + + else: + raise ValueError + + # print(new_k) + if not new_k.startswith('head'): + new_k = 'backbone.' 
+ new_k + + if 'downsample' in new_k: + print('Covert {} in PatchMerging from timm to mmcv format!'.format( + new_k)) + + if 'reduction' in new_k: + new_v = correct_unfold_reduction_order(new_v) + elif 'norm' in new_k: + new_v = correct_unfold_norm_order(new_v) + + new_ckpt[new_k] = new_v + + return new_ckpt + + +def main(): + parser = argparse.ArgumentParser( + description='Convert keys in pretrained van models to mmcls style.') + parser.add_argument('src', help='src model path or url') + # The dst path must be a full path of the new checkpoint. + parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + + if 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + + weight = convert_mixmim(state_dict) + # weight = convert_official_mixmim(state_dict) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(weight, args.dst) + + print('Done!!') + + +if __name__ == '__main__': + main() From 5b266d9e7c49044d1fcf15d046df3c06af3ff505 Mon Sep 17 00:00:00 2001 From: Rongjie Li Date: Thu, 22 Dec 2022 16:33:57 +0800 Subject: [PATCH 14/21] [Feature] Add clip backbone. (#1258) * clip backbone added * passed precommit * update readme * update according to PR review * add missing file * add unittest * refine metafile * refine metafile and readme for readdocs * refine metafile * refine metafile * Update metafile Co-authored-by: mzr1996 --- .../_base_/datasets/imagenet_bs64_clip_224.py | 72 +++++ .../_base_/datasets/imagenet_bs64_clip_384.py | 72 +++++ .../_base_/datasets/imagenet_bs64_clip_448.py | 73 +++++ configs/clip/README.md | 53 ++++ configs/clip/metafile.yml | 296 ++++++++++++++++++ .../clip/vit-base-p16_pt-64xb64_in1k-384px.py | 12 + .../clip/vit-base-p16_pt-64xb64_in1k-448px.py | 12 + configs/clip/vit-base-p16_pt-64xb64_in1k.py | 12 + .../clip/vit-base-p32_pt-64xb64_in1k-384px.py | 12 + .../clip/vit-base-p32_pt-64xb64_in1k-448px.py | 12 + configs/clip/vit-base-p32_pt-64xb64_in1k.py | 12 + mmcls/models/backbones/vision_transformer.py | 10 + model-index.yml | 1 + .../test_backbones/test_vision_transformer.py | 6 + tools/model_converters/clip_to_mmcls.py | 74 +++++ 15 files changed, 729 insertions(+) create mode 100644 configs/_base_/datasets/imagenet_bs64_clip_224.py create mode 100644 configs/_base_/datasets/imagenet_bs64_clip_384.py create mode 100644 configs/_base_/datasets/imagenet_bs64_clip_448.py create mode 100644 configs/clip/README.md create mode 100644 configs/clip/metafile.yml create mode 100644 configs/clip/vit-base-p16_pt-64xb64_in1k-384px.py create mode 100644 configs/clip/vit-base-p16_pt-64xb64_in1k-448px.py create mode 100644 configs/clip/vit-base-p16_pt-64xb64_in1k.py create mode 100644 configs/clip/vit-base-p32_pt-64xb64_in1k-384px.py create mode 100644 configs/clip/vit-base-p32_pt-64xb64_in1k-448px.py create mode 100644 configs/clip/vit-base-p32_pt-64xb64_in1k.py create mode 100644 tools/model_converters/clip_to_mmcls.py diff --git a/configs/_base_/datasets/imagenet_bs64_clip_224.py b/configs/_base_/datasets/imagenet_bs64_clip_224.py new file mode 100644 index 00000000000..f3149d44ef6 --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs64_clip_224.py @@ -0,0 +1,72 @@ +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + to_rgb=True) +image_size = 224 +train_pipeline = [ + 
dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=image_size, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + # dict( + # type='RandAugment', + # policies={{_base_.rand_increasing_policies}}, + # num_policies=2, + # total_level=10, + # magnitude_level=9, + # magnitude_std=0.5, + # hparams=dict( + # pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]], + # interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=img_norm_cfg['mean'][::-1], + fill_std=img_norm_cfg['std'][::-1]), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='Resize', + size=(image_size, -1), + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=image_size), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] + +data = dict( + samples_per_gpu=64, + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_prefix='data/imagenet/train', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_prefix='data/imagenet/val', + ann_file='data/imagenet/meta/val.txt', + pipeline=test_pipeline), + test=dict( + # replace `data/val` with `data/test` for standard test + type=dataset_type, + data_prefix='data/imagenet/val', + ann_file='data/imagenet/meta/val.txt', + pipeline=test_pipeline)) + +evaluation = dict(interval=10, metric='accuracy') diff --git a/configs/_base_/datasets/imagenet_bs64_clip_384.py b/configs/_base_/datasets/imagenet_bs64_clip_384.py new file mode 100644 index 00000000000..2a57a8923fa --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs64_clip_384.py @@ -0,0 +1,72 @@ +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + to_rgb=True) +image_size = 384 +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=image_size, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + # dict( + # type='RandAugment', + # policies={{_base_.rand_increasing_policies}}, + # num_policies=2, + # total_level=10, + # magnitude_level=9, + # magnitude_std=0.5, + # hparams=dict( + # pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]], + # interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=img_norm_cfg['mean'][::-1], + fill_std=img_norm_cfg['std'][::-1]), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='Resize', + size=(image_size, -1), + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=image_size), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] + +data = dict( + samples_per_gpu=64, + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_prefix='data/imagenet/train', + 
pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_prefix='data/imagenet/val', + ann_file='data/imagenet/meta/val.txt', + pipeline=test_pipeline), + test=dict( + # replace `data/val` with `data/test` for standard test + type=dataset_type, + data_prefix='data/imagenet/val', + ann_file='data/imagenet/meta/val.txt', + pipeline=test_pipeline)) + +evaluation = dict(interval=10, metric='accuracy') diff --git a/configs/_base_/datasets/imagenet_bs64_clip_448.py b/configs/_base_/datasets/imagenet_bs64_clip_448.py new file mode 100644 index 00000000000..d4fe2a98627 --- /dev/null +++ b/configs/_base_/datasets/imagenet_bs64_clip_448.py @@ -0,0 +1,73 @@ +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255], + std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255], + to_rgb=True) +image_size = 448 + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=image_size, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + # dict( + # type='RandAugment', + # policies={{_base_.rand_increasing_policies}}, + # num_policies=2, + # total_level=10, + # magnitude_level=9, + # magnitude_std=0.5, + # hparams=dict( + # pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]], + # interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=img_norm_cfg['mean'][::-1], + fill_std=img_norm_cfg['std'][::-1]), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='Resize', + size=(image_size, -1), + backend='pillow', + interpolation='bicubic'), + dict(type='CenterCrop', crop_size=image_size), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] + +data = dict( + samples_per_gpu=64, + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_prefix='data/imagenet/train', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_prefix='data/imagenet/val', + ann_file='data/imagenet/meta/val.txt', + pipeline=test_pipeline), + test=dict( + # replace `data/val` with `data/test` for standard test + type=dataset_type, + data_prefix='data/imagenet/val', + ann_file='data/imagenet/meta/val.txt', + pipeline=test_pipeline)) + +evaluation = dict(interval=10, metric='accuracy') diff --git a/configs/clip/README.md b/configs/clip/README.md new file mode 100644 index 00000000000..a15f12b4f77 --- /dev/null +++ b/configs/clip/README.md @@ -0,0 +1,53 @@ +# CLIP + +> [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) + + + +## Abstract + + + +State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This restricted form of supervision limits their generality and usability since additional labeled data is needed to specify any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a much broader source of supervision. 
We demonstrate that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks. We study the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without the need for any dataset specific training. For instance, we match the accuracy of the original ResNet-50 on ImageNet zero-shot without needing to use any of the 1.28 million training examples it was trained on. We release our code and pre-trained model weights at this https URL. + + + +
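+The pre-training task described above (predict which caption goes with which image) amounts to a
+symmetric contrastive loss over an image-text similarity matrix. The sketch below only illustrates that
+objective and is not code shipped in this patch, which adds the fine-tuned image backbone; the
+pre-computed, L2-normalized embeddings and the fixed `logit_scale` value are assumptions.
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def clip_contrastive_loss(img_emb, txt_emb, logit_scale=100.0):
+    """Symmetric cross-entropy over cosine similarities of matched pairs.
+
+    img_emb, txt_emb: (N, D) embeddings, assumed L2-normalized; the i-th
+    image and the i-th text form the only positive pair in the batch.
+    """
+    logits_per_image = logit_scale * img_emb @ txt_emb.t()  # (N, N) similarity matrix
+    logits_per_text = logits_per_image.t()
+    targets = torch.arange(img_emb.size(0))                 # positives lie on the diagonal
+    loss_i = F.cross_entropy(logits_per_image, targets)
+    loss_t = F.cross_entropy(logits_per_text, targets)
+    return (loss_i + loss_t) / 2
+
+
+# Toy usage with random, normalized embeddings.
+img = F.normalize(torch.randn(8, 512), dim=-1)
+txt = F.normalize(torch.randn(8, 512), dim=-1)
+print(clip_contrastive_loss(img, txt))
+```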
+ +
+ +## Results and models + +### ImageNet-1k + +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :--------------------------------------------: | :---------------------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------: | :-----------------------------------------------: | +| clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k\* | LAION-2B & ImageNet-12k | 88.22 | 4.36 | 83.06 | 96.49 | [config](./vit-base-p32_pt-64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k_20221220-b384e830.pth) | +| clip-vit-base-p32_laion2b-pre_3rdparty_in1k\* | LAION-2B | 88.22 | 4.36 | 82.46 | 96.12 | [config](./vit-base-p32_pt-64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_laion2b-pre_3rdparty_in1k_20221220-194df57f.pth) | +| clip-vit-base-p32_openai-pre_3rdparty_in1k\* | OpenAI | 88.22 | 4.36 | 81.77 | 95.89 | [config](./vit-base-p32_pt-64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_openai-pre_3rdparty_in1k_20221220-a0182ba9.pth) | +| clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k-384px\* | LAION-2B & ImageNet-12k | 88.22 | 12.66 | 85.39 | 97.67 | [config](./vit-base-p32_pt-64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k-384px_20221220-c7757552.pth) | +| clip-vit-base-p32_openai-in12k-pre_3rdparty_in1k-384px\* | OpenAI & ImageNet-12k | 88.22 | 12.66 | 85.13 | 97.42 | [config](./vit-base-p32_pt-64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_openai-in12k-pre_3rdparty_in1k-384px_20221220-dc2e49ea.pth) | +| clip-vit-base-p16_laion2b-in12k-pre_3rdparty_in1k\* | LAION-2B & ImageNet-12k | 86.57 | 16.86 | 86.02 | 97.76 | [config](./vit-base-p16_pt-64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_laion2b-in12k-pre_3rdparty_in1k_20221220-a5e31f8c.pth) | +| clip-vit-base-p16_laion2b-pre_3rdparty_in1k\* | LAION-2B | 86.57 | 16.86 | 85.49 | 97.59 | [config](./vit-base-p16_pt-64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_laion2b-pre_3rdparty_in1k_20221220-5e24ff58.pth) | +| clip-vit-base-p16_openai-in12k-pre_3rdparty_in1k\* | OpenAI & ImageNet-12k | 86.57 | 16.86 | 85.99 | 97.72 | [config](./vit-base-p16_pt-64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_openai-in12k-pre_3rdparty_in1k_20221220-90d930a8.pth) | +| clip-vit-base-p16_openai-pre_3rdparty_in1k\* | OpenAI | 86.57 | 16.86 | 85.30 | 97.50 | [config](./vit-base-p16_pt-64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_openai-pre_3rdparty_in1k_20221220-c7d9c899.pth) | +| clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k-448px\* | LAION-2B & ImageNet-12k | 88.22 | 17.20 | 85.76 | 97.63 | [config](./vit-base-p32_pt-64xb64_in1k-448px.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k-448px_20221220-ca404a7d.pth) | +| clip-vit-base-p16_laion2b-in12k-pre_3rdparty_in1k-384px\* | LAION-2B & ImageNet-12k | 86.57 | 49.37 | 87.17 | 98.02 | [config](./vit-base-p16_pt-64xb64_in1k-384px.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_laion2b-in12k-pre_3rdparty_in1k-384px_20221220-84ed0cc0.pth) | +| clip-vit-base-p16_laion2b-pre_3rdparty_in1k-384px\* | LAION-2B | 86.57 | 49.37 | 86.52 | 97.97 | [config](./vit-base-p16_pt-64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_laion2b-pre_3rdparty_in1k-384px_20221220-558ed826.pth) | +| clip-vit-base-p16_openai-in12k-pre_3rdparty_in1k-384px\* | OpenAI & ImageNet-12k | 86.57 | 49.37 | 86.87 | 98.05 | [config](./vit-base-p16_pt-64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_openai-in12k-pre_3rdparty_in1k-384px_20221220-8df86b74.pth) | +| clip-vit-base-p16_openai-pre_3rdparty_in1k-384px\* | OpenAI | 86.57 | 49.37 | 86.25 | 97.90 | [config](./vit-base-p16_pt-64xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_openai-pre_3rdparty_in1k-384px_20221220-eb012e87.pth) | + +*Models with * are converted from the [official repo](https://github.com/rwightman/pytorch-image-models). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +```bibtex +@InProceedings{pmlr-v139-radford21a, +title = {Learning Transferable Visual Models From Natural Language Supervision}, +author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya}, +booktitle = {Proceedings of the 38th International Conference on Machine Learning}, +year = {2021}, +series = {Proceedings of Machine Learning Research}, +publisher = {PMLR}, +} +``` diff --git a/configs/clip/metafile.yml b/configs/clip/metafile.yml new file mode 100644 index 00000000000..2fb213abd0b --- /dev/null +++ b/configs/clip/metafile.yml @@ -0,0 +1,296 @@ +Collections: + - Name: CLIP + Metadata: + Architecture: + - Attention Dropout + - Convolution + - Dense Connections + - Dropout + - GELU + - Layer Normalization + - Multi-Head Attention + - Scaled Dot-Product Attention + - Tanh Activation + Paper: + Title: Learning Transferable Visual Models From Natural Language Supervision + URL: https://arxiv.org/abs/2201.09792 + README: configs/clip/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/dev-1.x/mmcls/models/backbones/vision_transformer.py + Version: v1.0.0 + +Models: + - Name: clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k + Metadata: + FLOPs: 4364335104 + Parameters: 88225000 + Training Data: + - LAION-2B + - ImageNet-12k + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.06 + Top 5 Accuracy: 96.49 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k_20221220-b384e830.pth + Config: configs/clip/vit-base-p32_pt-64xb64_in1k.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch32_clip_224.laion2b_ft_in12k_in1k + - Name: clip-vit-base-p32_laion2b-pre_3rdparty_in1k + Metadata: + FLOPs: 4364335104 + Parameters: 88225000 + Training Data: + - LAION-2B + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.46 + Top 5 
Accuracy: 96.12 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_laion2b-pre_3rdparty_in1k_20221220-194df57f.pth + Config: configs/clip/vit-base-p32_pt-64xb64_in1k.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch32_clip_224.laion2b_ft_in1k + - Name: clip-vit-base-p32_openai-pre_3rdparty_in1k + Metadata: + FLOPs: 4364335104 + Parameters: 88225000 + Training Data: + - OpenAI + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.77 + Top 5 Accuracy: 95.89 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_openai-pre_3rdparty_in1k_20221220-a0182ba9.pth + Config: configs/clip/vit-base-p32_pt-64xb64_in1k.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch32_clip_224.openai_ft_in1k + - Name: clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k-384px + Metadata: + FLOPs: 12661054464 + Parameters: 88225000 + Training Data: + - LAION-2B + - ImageNet-12k + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.39 + Top 5 Accuracy: 97.67 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k-384px_20221220-c7757552.pth + Config: configs/clip/vit-base-p32_pt-64xb64_in1k-384px.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch32_clip_384.laion2b_ft_in12k_in1k + - Name: clip-vit-base-p32_openai-in12k-pre_3rdparty_in1k-384px + Metadata: + FLOPs: 12661054464 + Parameters: 88225000 + Training Data: + - OpenAI + - ImageNet-12k + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.13 + Top 5 Accuracy: 97.42 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_openai-in12k-pre_3rdparty_in1k-384px_20221220-dc2e49ea.pth + Config: configs/clip/vit-base-p32_pt-64xb64_in1k-384px.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch32_clip_384.openai_ft_in12k_in1k + - Name: clip-vit-base-p16_laion2b-in12k-pre_3rdparty_in1k + Metadata: + FLOPs: 16855600128 + Parameters: 86568424 + Training Data: + - LAION-2B + - ImageNet-12k + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.02 + Top 5 Accuracy: 97.76 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_laion2b-in12k-pre_3rdparty_in1k_20221220-a5e31f8c.pth + Config: configs/clip/vit-base-p16_pt-64xb64_in1k.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch16_clip_224.laion2b_ft_in12k_in1k + - Name: clip-vit-base-p16_laion2b-pre_3rdparty_in1k + Metadata: + FLOPs: 16855600128 + Parameters: 86568424 + Training Data: + - LAION-2B + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.49 + Top 5 Accuracy: 97.59 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_laion2b-pre_3rdparty_in1k_20221220-5e24ff58.pth + Config: 
configs/clip/vit-base-p16_pt-64xb64_in1k.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch16_clip_224.laion2b_ft_in1k + - Name: clip-vit-base-p16_openai-in12k-pre_3rdparty_in1k + Metadata: + FLOPs: 16855600128 + Parameters: 86568424 + Training Data: + - OpenAI + - ImageNet-12k + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.99 + Top 5 Accuracy: 97.72 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_openai-in12k-pre_3rdparty_in1k_20221220-90d930a8.pth + Config: configs/clip/vit-base-p16_pt-64xb64_in1k.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch16_clip_224.openai_ft_in12k_in1k + - Name: clip-vit-base-p16_openai-pre_3rdparty_in1k + Metadata: + FLOPs: 16855600128 + Parameters: 86568424 + Training Data: + - OpenAI + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.3 + Top 5 Accuracy: 97.5 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_openai-pre_3rdparty_in1k_20221220-c7d9c899.pth + Config: configs/clip/vit-base-p16_pt-64xb64_in1k.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch16_clip_224.openai_ft_in1k + - Name: clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k-448px + Metadata: + FLOPs: 17202416640 + Parameters: 88225000 + Training Data: + - LAION-2B + - ImageNet-12k + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.76 + Top 5 Accuracy: 97.63 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p32_laion2b-in12k-pre_3rdparty_in1k-448px_20221220-ca404a7d.pth + Config: configs/clip/vit-base-p32_pt-64xb64_in1k-448px.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k + - Name: clip-vit-base-p16_laion2b-in12k-pre_3rdparty_in1k-384px + Metadata: + FLOPs: 49370078208 + Parameters: 86568424 + Training Data: + - LAION-2B + - ImageNet-12k + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 87.17 + Top 5 Accuracy: 98.02 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_laion2b-in12k-pre_3rdparty_in1k-384px_20221220-84ed0cc0.pth + Config: configs/clip/vit-base-p16_pt-64xb64_in1k-384px.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch16_clip_384.laion2b_ft_in12k_in1k + - Name: clip-vit-base-p16_laion2b-pre_3rdparty_in1k-384px + Metadata: + FLOPs: 49370078208 + Parameters: 86568424 + Training Data: + - LAION-2B + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.52 + Top 5 Accuracy: 97.97 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_laion2b-pre_3rdparty_in1k-384px_20221220-558ed826.pth + Config: configs/clip/vit-base-p16_pt-64xb64_in1k-384px.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: 
https://huggingface.co/timm/vit_base_patch16_clip_384.laion2b_ft_in1k + - Name: clip-vit-base-p16_openai-in12k-pre_3rdparty_in1k-384px + Metadata: + FLOPs: 49370078208 + Parameters: 86568424 + Training Data: + - OpenAI + - ImageNet-12k + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.87 + Top 5 Accuracy: 98.05 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_openai-in12k-pre_3rdparty_in1k-384px_20221220-8df86b74.pth + Config: configs/clip/vit-base-p16_pt-64xb64_in1k-384px.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch16_clip_384.openai_ft_in12k_in1k + - Name: clip-vit-base-p16_openai-pre_3rdparty_in1k-384px + Metadata: + FLOPs: 49370078208 + Parameters: 86568424 + Training Data: + - OpenAI + - ImageNet-1k + In Collection: CLIP + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.25 + Top 5 Accuracy: 97.9 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/clip/clip-vit-base-p16_openai-pre_3rdparty_in1k-384px_20221220-eb012e87.pth + Config: configs/clip/vit-base-p16_pt-64xb64_in1k-384px.py + Converted From: + Code: https://github.com/rwightman/pytorch-image-models + Weights: https://huggingface.co/timm/vit_base_patch16_clip_384.openai_ft_in1k diff --git a/configs/clip/vit-base-p16_pt-64xb64_in1k-384px.py b/configs/clip/vit-base-p16_pt-64xb64_in1k-384px.py new file mode 100644 index 00000000000..45bd29f4e42 --- /dev/null +++ b/configs/clip/vit-base-p16_pt-64xb64_in1k-384px.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/vit-base-p16.py', + '../_base_/datasets/imagenet_bs64_clip_384.py', + '../_base_/schedules/imagenet_bs4096_AdamW.py', + '../_base_/default_runtime.py' +] + +# model setting/mnt/lustre/lirongjie/tmp/clip_ckpt/trans_ckpt +model = dict(backbone=dict(pre_norm=True, ), ) + +# schedule setting +optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) diff --git a/configs/clip/vit-base-p16_pt-64xb64_in1k-448px.py b/configs/clip/vit-base-p16_pt-64xb64_in1k-448px.py new file mode 100644 index 00000000000..a8ca8171057 --- /dev/null +++ b/configs/clip/vit-base-p16_pt-64xb64_in1k-448px.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/vit-base-p16.py', + '../_base_/datasets/imagenet_bs64_clip_448.py', + '../_base_/schedules/imagenet_bs4096_AdamW.py', + '../_base_/default_runtime.py' +] + +# model setting/mnt/lustre/lirongjie/tmp/clip_ckpt/trans_ckpt +model = dict(backbone=dict(pre_norm=True, ), ) + +# schedule setting +optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) diff --git a/configs/clip/vit-base-p16_pt-64xb64_in1k.py b/configs/clip/vit-base-p16_pt-64xb64_in1k.py new file mode 100644 index 00000000000..12476e84091 --- /dev/null +++ b/configs/clip/vit-base-p16_pt-64xb64_in1k.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/vit-base-p16.py', + '../_base_/datasets/imagenet_bs64_clip_224.py', + '../_base_/schedules/imagenet_bs4096_AdamW.py', + '../_base_/default_runtime.py' +] + +# model setting/mnt/lustre/lirongjie/tmp/clip_ckpt/trans_ckpt +model = dict(backbone=dict(pre_norm=True, ), ) + +# schedule setting +optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) diff --git a/configs/clip/vit-base-p32_pt-64xb64_in1k-384px.py b/configs/clip/vit-base-p32_pt-64xb64_in1k-384px.py new file mode 100644 index 00000000000..86713ecdaa7 --- /dev/null +++ b/configs/clip/vit-base-p32_pt-64xb64_in1k-384px.py @@ -0,0 +1,12 @@ +_base_ = [ + 
'../_base_/models/vit-base-p32.py', + '../_base_/datasets/imagenet_bs64_clip_384.py', + '../_base_/schedules/imagenet_bs4096_AdamW.py', + '../_base_/default_runtime.py' +] + +# model setting/mnt/lustre/lirongjie/tmp/clip_ckpt/trans_ckpt +model = dict(backbone=dict(pre_norm=True, ), ) + +# schedule setting +optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) diff --git a/configs/clip/vit-base-p32_pt-64xb64_in1k-448px.py b/configs/clip/vit-base-p32_pt-64xb64_in1k-448px.py new file mode 100644 index 00000000000..1bad04cbc16 --- /dev/null +++ b/configs/clip/vit-base-p32_pt-64xb64_in1k-448px.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/vit-base-p32.py', + '../_base_/datasets/imagenet_bs64_clip_448.py', + '../_base_/schedules/imagenet_bs4096_AdamW.py', + '../_base_/default_runtime.py' +] + +# model setting/mnt/lustre/lirongjie/tmp/clip_ckpt/trans_ckpt +model = dict(backbone=dict(pre_norm=True, ), ) + +# schedule setting +optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) diff --git a/configs/clip/vit-base-p32_pt-64xb64_in1k.py b/configs/clip/vit-base-p32_pt-64xb64_in1k.py new file mode 100644 index 00000000000..6482dd36170 --- /dev/null +++ b/configs/clip/vit-base-p32_pt-64xb64_in1k.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/vit-base-p32.py', + '../_base_/datasets/imagenet_bs64_clip_224.py', + '../_base_/schedules/imagenet_bs4096_AdamW.py', + '../_base_/default_runtime.py' +] + +# model setting/mnt/lustre/lirongjie/tmp/clip_ckpt/trans_ckpt +model = dict(backbone=dict(pre_norm=True, ), ) + +# schedule setting +optim_wrapper = dict(clip_grad=dict(max_norm=1.0)) diff --git a/mmcls/models/backbones/vision_transformer.py b/mmcls/models/backbones/vision_transformer.py index a3771f2695f..655f8edc87c 100644 --- a/mmcls/models/backbones/vision_transformer.py +++ b/mmcls/models/backbones/vision_transformer.py @@ -236,6 +236,7 @@ def __init__(self, interpolate_mode='bicubic', patch_cfg=dict(), layer_cfgs=dict(), + pre_norm=False, init_cfg=None): super(VisionTransformer, self).__init__(init_cfg) @@ -264,6 +265,7 @@ def __init__(self, conv_type='Conv2d', kernel_size=patch_size, stride=patch_size, + bias=not pre_norm, # disable bias if pre_norm is used(e.g., CLIP) ) _patch_cfg.update(patch_cfg) self.patch_embed = PatchEmbed(**_patch_cfg) @@ -319,6 +321,13 @@ def __init__(self, self.layers.append(TransformerEncoderLayer(**_layer_cfg)) self.frozen_stages = frozen_stages + if pre_norm: + _, norm_layer = build_norm_layer( + norm_cfg, self.embed_dims, postfix=1) + else: + norm_layer = nn.Identity() + self.add_module('pre_norm', norm_layer) + self.final_norm = final_norm if final_norm: self.norm1_name, norm1 = build_norm_layer( @@ -417,6 +426,7 @@ def forward(self, x): num_extra_tokens=self.num_extra_tokens) x = self.drop_after_pos(x) + x = self.pre_norm(x) if not self.with_cls_token: # Remove class token for transformer encoder input x = x[:, 1:] diff --git a/model-index.yml b/model-index.yml index a761ab8a225..c036c685c99 100644 --- a/model-index.yml +++ b/model-index.yml @@ -45,4 +45,5 @@ Import: - configs/beitv2/metafile.yml - configs/eva/metafile.yml - configs/revvit/metafile.yml + - configs/clip/metafile.yml - configs/mixmim/metafile.yml diff --git a/tests/test_models/test_backbones/test_vision_transformer.py b/tests/test_models/test_backbones/test_vision_transformer.py index b51fe306f88..5f0fbbf5a4a 100644 --- a/tests/test_models/test_backbones/test_vision_transformer.py +++ b/tests/test_models/test_backbones/test_vision_transformer.py @@ -73,6 +73,12 @@ def test_structure(self): 
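The `pre_norm` switch added to `VisionTransformer` above can be exercised directly. A minimal sketch, assuming the default arch aliases; the architecture name and input size here are illustrative only:

```python
import torch

from mmcls.models import VisionTransformer

# Sketch: build a ViT-B/16 backbone with the new CLIP-style pre-norm enabled.
# With pre_norm=True the patch-embedding bias is disabled and a LayerNorm is
# applied right after the position embedding (and its dropout), as in the
# diff above.
backbone = VisionTransformer(arch='b', img_size=224, patch_size=16, pre_norm=True)
feats = backbone(torch.rand(1, 3, 224, 224))  # tuple of output features
```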
self.assertAlmostEqual(layer.ffn.dropout_layer.drop_prob, dpr) dpr += dpr_inc + # Test model structure: prenorm + cfg = deepcopy(self.cfg) + cfg['pre_norm'] = True + model = VisionTransformer(**cfg) + self.assertNotEqual(model.pre_norm.__class__, torch.nn.Identity) + def test_init_weights(self): # test weight init cfg cfg = deepcopy(self.cfg) diff --git a/tools/model_converters/clip_to_mmcls.py b/tools/model_converters/clip_to_mmcls.py new file mode 100644 index 00000000000..6c179cfbf5e --- /dev/null +++ b/tools/model_converters/clip_to_mmcls.py @@ -0,0 +1,74 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from collections import OrderedDict + +import mmengine +import torch +from mmengine.runner import CheckpointLoader + + +def convert_clip(ckpt): + new_ckpt = OrderedDict() + + for k, v in list(ckpt.items()): + new_v = v + if k.startswith('head'): + new_k = k.replace('head.', 'head.layers.head.') + new_ckpt[new_k] = new_v + continue + elif k.startswith('patch_embed'): + if 'proj.' in k: + new_k = k.replace('proj.', 'projection.') + else: + new_k = k + elif k.startswith('norm_pre'): + new_k = k.replace('norm_pre', 'pre_norm') + elif k.startswith('blocks'): + new_k = k.replace('blocks.', 'layers.') + if 'norm1' in k: + new_k = new_k.replace('norm1', 'ln1') + elif 'norm2' in k: + new_k = new_k.replace('norm2', 'ln2') + elif 'mlp.fc1' in k: + new_k = new_k.replace('mlp.fc1', 'ffn.layers.0.0') + elif 'mlp.fc2' in k: + new_k = new_k.replace('mlp.fc2', 'ffn.layers.1') + elif k.startswith('norm'): + new_k = k.replace('norm', 'ln1') + else: + new_k = k + + if not new_k.startswith('head'): + new_k = 'backbone.' + new_k + new_ckpt[new_k] = new_v + return new_ckpt + + +'${ls /mnt/lustre/lirongjie/tmp/clip_ckpt/download_ckpt}' + + +def main(): + parser = argparse.ArgumentParser( + description='Convert keys in pretrained clip models to mmcls style.') + parser.add_argument('src', help='src model path or url') + # The dst path must be a full path of the new checkpoint. + parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + weight = convert_clip(state_dict) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(weight, args.dst) + + print('Done!!') + + +if __name__ == '__main__': + main() From bac181f39302cce6f54885fd50a4b7e1fcc1cdd9 Mon Sep 17 00:00:00 2001 From: Colle Date: Fri, 30 Dec 2022 03:36:00 +0100 Subject: [PATCH 15/21] [Feature] Support Multi-task. (#1229) * unit test for multi_task_head * [Feature] MultiTaskHead (#628, #481) * [Fix] lint for multi_task_head * [Feature] Add `MultiTaskDataset` to support multi-task training. * Update MultiTaskClsHead * Update docs * [CI] Add test mim CI. (#879) * [Fix] Remove duplicated wide-resnet metafile. * [Feature] Support MPS device. (#894) * [Feature] Support MPS device. * Add `auto_select_device` * Add unit tests * [Fix] Fix Albu crash bug. (#918) * Fix albu BUG: using albu will cause the label from array(x) to array([x]) and crash the trainning * Fix common * Using copy incase potential bug in multi-label tasks * Improve coding * Improve code logic * Add unit test * Fix typo * Fix yapf * Bump version to 0.23.2. (#937) * [Improve] Use `forward_dummy` to calculate FLOPS. (#953) * Update README * [Docs] Fix typo for wrong reference. 
(#1036) * [Doc] Fix typo in tutorial 2 (#1043) * [Docs] Fix a typo in ImageClassifier (#1050) * add mask to loss * add another pipeline * adpat the pipeline if there is no mask * switch mask and task * first version of multi data smaple * fix problem with attribut by getattr * rm img_label suffix, fix 'LabelData' object has no attribute 'gt_label' * training without evaluation * first version work * add others metrics * delete evaluation from dataset * fix linter * fix linter * multi metrics * first version of test * change evaluate metric * Update tests/test_models/test_heads.py Co-authored-by: Colle * Update tests/test_models/test_heads.py Co-authored-by: Colle * add tests * add test for multidatasample * create a generic test * create a generic test * create a generic test * change multi data sample * correct test * test * add new test * add test for dataset * correct test * correct test * correct test * correct test * fix : #5 * run yapf * fix linter * fix linter * fix linter * fix isort * fix isort * fix docformmater * fix docformmater * fix linter * fix linter * fix data sample * Update mmcls/structures/multi_task_data_sample.py Co-authored-by: Colle * Update mmcls/structures/multi_task_data_sample.py Co-authored-by: Colle * Update mmcls/structures/multi_task_data_sample.py Co-authored-by: Colle * Update mmcls/structures/multi_task_data_sample.py Co-authored-by: Colle * Update mmcls/structures/multi_task_data_sample.py Co-authored-by: Colle * Update mmcls/structures/multi_task_data_sample.py Co-authored-by: Colle * Update tests/test_structures/test_datasample.py Co-authored-by: Colle * Update mmcls/structures/multi_task_data_sample.py Co-authored-by: Colle * Update tests/test_structures/test_datasample.py Co-authored-by: Colle * Update tests/test_structures/test_datasample.py Co-authored-by: Colle * update data sample * update head * update head * update multi data sample * fix linter * fix linter * fix linter * fix linter * fix linter * fix linter * update head * fix problem we don't set pred or gt * fix problem we don't set pred or gt * fix problem we don't set pred or gt * fix linter * fix : #2 * fix : linter * update multi head * fix linter * fix linter * update data sample * update data sample * fix ; linter * update test * test pipeline * update pipeline * update test * update dataset * update dataset * fix linter * fix linter * update formatting * add test for multi-task-eval * update formatting * fix linter * update test * update * add test * update metrics * update metrics * add doc for functions * fix linter * training for multitask 1.x * fix linter * run flake8 * run linter * update test * add mask in evaluation * update metric doc * update metric doc * Update mmcls/evaluation/metrics/multi_task.py Co-authored-by: Colle * Update mmcls/evaluation/metrics/multi_task.py Co-authored-by: Colle * Update mmcls/evaluation/metrics/multi_task.py Co-authored-by: Colle * Update mmcls/evaluation/metrics/multi_task.py Co-authored-by: Colle * Update mmcls/evaluation/metrics/multi_task.py Co-authored-by: Colle * Update mmcls/evaluation/metrics/multi_task.py Co-authored-by: Colle * update metric doc * update metric doc * Fix cannot import name MultiTaskDataSample * fix test_datasets * fix test_datasets * fix linter * add an example of multitask * change name of configs dataset * Refactor the multi-task support * correct test and metric * add test to multidatasample * add test to multidatasample * correct test * correct metrics and clshead * Update mmcls/models/heads/cls_head.py 
Co-authored-by: Colle * update cls_head.py documentation * lint * lint * fix: lint * fix linter * add eval mask * fix documentation * fix: single_label.py back to 1.x * Update mmcls/models/heads/multi_task_head.py Co-authored-by: Ma Zerun * Remove multi-task configs. Co-authored-by: mzr1996 Co-authored-by: HinGwenWoong Co-authored-by: Ming-Hsuan-Tu Co-authored-by: Lei Lei <18294546+Crescent-Saturn@users.noreply.github.com> Co-authored-by: WRH <12756472+wangruohui@users.noreply.github.com> Co-authored-by: marouaneamz Co-authored-by: marouane amzil <53240092+marouaneamz@users.noreply.github.com> --- mmcls/datasets/__init__.py | 3 +- mmcls/datasets/multi_task.py | 344 ++++++++++++++++++ mmcls/datasets/transforms/__init__.py | 6 +- mmcls/datasets/transforms/formatting.py | 90 ++++- mmcls/evaluation/metrics/__init__.py | 3 +- mmcls/evaluation/metrics/multi_task.py | 120 ++++++ mmcls/models/heads/__init__.py | 4 +- mmcls/models/heads/cls_head.py | 30 +- mmcls/models/heads/multi_task_head.py | 139 +++++++ mmcls/models/utils/data_preprocessor.py | 9 +- mmcls/structures/__init__.py | 3 +- mmcls/structures/multi_task_data_sample.py | 10 + tests/__init__.py | 1 + tests/data/dataset/multi-task.json | 40 ++ tests/test_datasets/test_datasets.py | 75 +++- .../test_transforms/test_formatting.py | 58 ++- .../test_metrics/test_multi_task_metrics.py | 134 +++++++ tests/test_models/test_heads.py | 138 ++++++- tests/test_structures/test_datasample.py | 19 +- 19 files changed, 1185 insertions(+), 41 deletions(-) create mode 100644 mmcls/datasets/multi_task.py create mode 100644 mmcls/evaluation/metrics/multi_task.py create mode 100644 mmcls/models/heads/multi_task_head.py create mode 100644 mmcls/structures/multi_task_data_sample.py create mode 100644 tests/__init__.py create mode 100644 tests/data/dataset/multi-task.json create mode 100644 tests/test_evaluation/test_metrics/test_multi_task_metrics.py diff --git a/mmcls/datasets/__init__.py b/mmcls/datasets/__init__.py index 0097a693299..22abdadcc51 100644 --- a/mmcls/datasets/__init__.py +++ b/mmcls/datasets/__init__.py @@ -8,6 +8,7 @@ from .imagenet import ImageNet, ImageNet21k from .mnist import MNIST, FashionMNIST from .multi_label import MultiLabelDataset +from .multi_task import MultiTaskDataset from .samplers import * # noqa: F401,F403 from .transforms import * # noqa: F401,F403 from .voc import VOC @@ -15,5 +16,5 @@ __all__ = [ 'BaseDataset', 'ImageNet', 'CIFAR10', 'CIFAR100', 'MNIST', 'FashionMNIST', 'VOC', 'build_dataset', 'ImageNet21k', 'KFoldDataset', 'CUB', - 'CustomDataset', 'MultiLabelDataset' + 'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset' ] diff --git a/mmcls/datasets/multi_task.py b/mmcls/datasets/multi_task.py new file mode 100644 index 00000000000..a28b4982002 --- /dev/null +++ b/mmcls/datasets/multi_task.py @@ -0,0 +1,344 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from os import PathLike +from typing import Optional, Sequence + +import mmengine +from mmcv.transforms import Compose +from mmengine.fileio import FileClient + +from .builder import DATASETS + + +def expanduser(path): + if isinstance(path, (str, PathLike)): + return osp.expanduser(path) + else: + return path + + +def isabs(uri): + return osp.isabs(uri) or ('://' in uri) + + +@DATASETS.register_module() +class MultiTaskDataset: + """Custom dataset for multi-task dataset. + + To use the dataset, please generate and provide an annotation file in the + below format: + + .. 
code-block:: json + + { + "metainfo": { + "tasks": + [ + 'gender' + 'wear' + ] + }, + "data_list": [ + { + "img_path": "a.jpg", + gt_label:{ + "gender": 0, + "wear": [1, 0, 1, 0] + } + }, + { + "img_path": "b.jpg", + gt_label:{ + "gender": 1, + "wear": [1, 0, 1, 0] + } + } + ] + } + + Assume we put our dataset in the ``data/mydataset`` folder in the + repository and organize it as the below format: :: + + mmclassification/ + └── data + └── mydataset + ├── annotation + │   ├── train.json + │   ├── test.json + │   └── val.json + ├── train + │   ├── a.jpg + │   └── ... + ├── test + │   ├── b.jpg + │   └── ... + └── val + ├── c.jpg + └── ... + + We can use the below config to build datasets: + + .. code:: python + + >>> from mmcls.datasets import build_dataset + >>> train_cfg = dict( + ... type="MultiTaskDataset", + ... ann_file="annotation/train.json", + ... data_root="data/mydataset", + ... # The `img_path` field in the train annotation file is relative + ... # to the `train` folder. + ... data_prefix='train', + ... ) + >>> train_dataset = build_dataset(train_cfg) + + Or we can put all files in the same folder: :: + + mmclassification/ + └── data + └── mydataset + ├── train.json + ├── test.json + ├── val.json + ├── a.jpg + ├── b.jpg + ├── c.jpg + └── ... + + And we can use the below config to build datasets: + + .. code:: python + + >>> from mmcls.datasets import build_dataset + >>> train_cfg = dict( + ... type="MultiTaskDataset", + ... ann_file="train.json", + ... data_root="data/mydataset", + ... # the `data_prefix` is not required since all paths are + ... # relative to the `data_root`. + ... ) + >>> train_dataset = build_dataset(train_cfg) + + + Args: + ann_file (str): The annotation file path. It can be either absolute + path or relative path to the ``data_root``. + metainfo (dict, optional): The extra meta information. It should be + a dict with the same format as the ``"metainfo"`` field in the + annotation file. Defaults to None. + data_root (str, optional): The root path of the data directory. It's + the prefix of the ``data_prefix`` and the ``ann_file``. And it can + be a remote path like "s3://openmmlab/xxx/". Defaults to None. + data_prefix (str, optional): The base folder relative to the + ``data_root`` for the ``"img_path"`` field in the annotation file. + Defaults to None. + pipeline (Sequence[dict]): A list of dict, where each element + represents a operation defined in :mod:`mmcls.datasets.pipelines`. + Defaults to an empty tuple. + test_mode (bool): in train mode or test mode. Defaults to False. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + If None, automatically inference from the ``data_root``. + Defaults to None. 
+ """ + METAINFO = dict() + + def __init__(self, + ann_file: str, + metainfo: Optional[dict] = None, + data_root: Optional[str] = None, + data_prefix: Optional[str] = None, + pipeline: Sequence = (), + test_mode: bool = False, + file_client_args: Optional[dict] = None): + + self.data_root = expanduser(data_root) + + # Inference the file client + if self.data_root is not None: + file_client = FileClient.infer_client( + file_client_args, uri=self.data_root) + else: + file_client = FileClient(file_client_args) + self.file_client: FileClient = file_client + + self.ann_file = self._join_root(expanduser(ann_file)) + self.data_prefix = self._join_root(data_prefix) + + self.test_mode = test_mode + self.pipeline = Compose(pipeline) + self.data_list = self.load_data_list(self.ann_file, metainfo) + + def _join_root(self, path): + """Join ``self.data_root`` with the specified path. + + If the path is an absolute path, just return the path. And if the + path is None, return ``self.data_root``. + + Examples: + >>> self.data_root = 'a/b/c' + >>> self._join_root('d/e/') + 'a/b/c/d/e' + >>> self._join_root('https://openmmlab.com') + 'https://openmmlab.com' + >>> self._join_root(None) + 'a/b/c' + """ + if path is None: + return self.data_root + if isabs(path): + return path + + joined_path = self.file_client.join_path(self.data_root, path) + return joined_path + + @classmethod + def _get_meta_info(cls, in_metainfo: dict = None) -> dict: + """Collect meta information from the dictionary of meta. + + Args: + in_metainfo (dict): Meta information dict. + + Returns: + dict: Parsed meta information. + """ + # `cls.METAINFO` will be overwritten by in_meta + metainfo = copy.deepcopy(cls.METAINFO) + if in_metainfo is None: + return metainfo + + metainfo.update(in_metainfo) + + return metainfo + + def load_data_list(self, ann_file, metainfo_override=None): + """Load annotations from an annotation file. + + Args: + ann_file (str): Absolute annotation file path if ``self.root=None`` + or relative path if ``self.root=/path/to/data/``. + + Returns: + list[dict]: A list of annotation. + """ + annotations = mmengine.load(ann_file) + if not isinstance(annotations, dict): + raise TypeError(f'The annotations loaded from annotation file ' + f'should be a dict, but got {type(annotations)}!') + if 'data_list' not in annotations: + raise ValueError('The annotation file must have the `data_list` ' + 'field.') + metainfo = annotations.get('metainfo', {}) + raw_data_list = annotations['data_list'] + + # Set meta information. + assert isinstance(metainfo, dict), 'The `metainfo` field in the '\ + f'annotation file should be a dict, but got {type(metainfo)}' + if metainfo_override is not None: + assert isinstance(metainfo_override, dict), 'The `metainfo` ' \ + f'argument should be a dict, but got {type(metainfo_override)}' + metainfo.update(metainfo_override) + self._metainfo = self._get_meta_info(metainfo) + + data_list = [] + for i, raw_data in enumerate(raw_data_list): + try: + data_list.append(self.parse_data_info(raw_data)) + except AssertionError as e: + raise RuntimeError( + f'The format check fails during parse the item {i} of ' + f'the annotation file with error: {e}') + return data_list + + def parse_data_info(self, raw_data): + """Parse raw annotation to target format. + + This method will return a dict which contains the data information of a + sample. + + Args: + raw_data (dict): Raw data information load from ``ann_file`` + + Returns: + dict: Parsed annotation. 
+ """ + assert isinstance(raw_data, dict), \ + f'The item should be a dict, but got {type(raw_data)}' + assert 'img_path' in raw_data, \ + "The item doesn't have `img_path` field." + data = dict( + img_path=self._join_root(raw_data['img_path']), + gt_label=raw_data['gt_label'], + ) + return data + + @property + def metainfo(self) -> dict: + """Get meta information of dataset. + + Returns: + dict: meta information collected from ``cls.METAINFO``, + annotation file and metainfo argument during instantiation. + """ + return copy.deepcopy(self._metainfo) + + def prepare_data(self, idx): + """Get data processed by ``self.pipeline``. + + Args: + idx (int): The index of ``data_info``. + + Returns: + Any: Depends on ``self.pipeline``. + """ + results = copy.deepcopy(self.data_list[idx]) + return self.pipeline(results) + + def __len__(self): + """Get the length of the whole dataset. + + Returns: + int: The length of filtered dataset. + """ + return len(self.data_list) + + def __getitem__(self, idx): + """Get the idx-th image and data information of dataset after + ``self.pipeline``. + + Args: + idx (int): The index of of the data. + + Returns: + dict: The idx-th image and data information after + ``self.pipeline``. + """ + return self.prepare_data(idx) + + def __repr__(self): + """Print the basic information of the dataset. + + Returns: + str: Formatted string. + """ + head = 'Dataset ' + self.__class__.__name__ + body = [f'Number of samples: \t{self.__len__()}'] + if self.data_root is not None: + body.append(f'Root location: \t{self.data_root}') + body.append(f'Annotation file: \t{self.ann_file}') + if self.data_prefix is not None: + body.append(f'Prefix of images: \t{self.data_prefix}') + # -------------------- extra repr -------------------- + tasks = self.metainfo['tasks'] + body.append(f'For {len(tasks)} tasks') + for task in tasks: + body.append(f' {task} ') + # ---------------------------------------------------- + + if len(self.pipeline.transforms) > 0: + body.append('With transforms:') + for t in self.pipeline.transforms: + body.append(f' {t}') + + lines = [head] + [' ' * 4 + line for line in body] + return '\n'.join(lines) diff --git a/mmcls/datasets/transforms/__init__.py b/mmcls/datasets/transforms/__init__.py index 8ea8db3d78f..1fa905a56cf 100644 --- a/mmcls/datasets/transforms/__init__.py +++ b/mmcls/datasets/transforms/__init__.py @@ -3,7 +3,8 @@ Brightness, ColorTransform, Contrast, Cutout, Equalize, Invert, Posterize, RandAugment, Rotate, Sharpness, Shear, Solarize, SolarizeAdd, Translate) -from .formatting import Collect, PackClsInputs, ToNumpy, ToPIL, Transpose +from .formatting import (Collect, PackClsInputs, PackMultiTaskInputs, ToNumpy, + ToPIL, Transpose) from .processing import (Albumentations, ColorJitter, EfficientNetCenterCrop, EfficientNetRandomCrop, Lighting, RandomCrop, RandomErasing, RandomResizedCrop, ResizeEdge) @@ -15,5 +16,6 @@ 'Contrast', 'Brightness', 'Sharpness', 'AutoAugment', 'SolarizeAdd', 'Cutout', 'RandAugment', 'Lighting', 'ColorJitter', 'RandomErasing', 'PackClsInputs', 'Albumentations', 'EfficientNetRandomCrop', - 'EfficientNetCenterCrop', 'ResizeEdge', 'BaseAugTransform' + 'EfficientNetCenterCrop', 'ResizeEdge', 'BaseAugTransform', + 'PackMultiTaskInputs' ] diff --git a/mmcls/datasets/transforms/formatting.py b/mmcls/datasets/transforms/formatting.py index c413d6f3fd0..d96ffed93cb 100644 --- a/mmcls/datasets/transforms/formatting.py +++ b/mmcls/datasets/transforms/formatting.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import warnings +from collections import defaultdict from collections.abc import Sequence +from functools import partial import numpy as np import torch @@ -9,7 +10,7 @@ from PIL import Image from mmcls.registry import TRANSFORMS -from mmcls.structures import ClsDataSample +from mmcls.structures import ClsDataSample, MultiTaskDataSample def to_tensor(data): @@ -85,12 +86,6 @@ def transform(self, results: dict) -> dict: img = np.expand_dims(img, -1) img = np.ascontiguousarray(img.transpose(2, 0, 1)) packed_results['inputs'] = to_tensor(img) - else: - warnings.warn( - 'Cannot get "img" in the input dict of `PackClsInputs`,' - 'please make sure `LoadImageFromFile` has been added ' - 'in the data pipeline or images have been loaded in ' - 'the dataset.') data_sample = ClsDataSample() if 'gt_label' in results: @@ -100,7 +95,6 @@ def transform(self, results: dict) -> dict: img_meta = {k: results[k] for k in self.meta_keys if k in results} data_sample.set_metainfo(img_meta) packed_results['data_samples'] = data_sample - return packed_results def __repr__(self) -> str: @@ -109,6 +103,84 @@ def __repr__(self) -> str: return repr_str +@TRANSFORMS.register_module() +class PackMultiTaskInputs(BaseTransform): + """Convert all image labels of multi-task dataset to a dict of tensor. + + Args: + tasks (List[str]): The task names defined in the dataset. + meta_keys(Sequence[str]): The meta keys to be saved in the + ``metainfo`` of the packed ``data_samples``. + Defaults to a tuple includes keys: + + - ``sample_idx``: The id of the image sample. + - ``img_path``: The path to the image file. + - ``ori_shape``: The original shape of the image as a tuple (H, W). + - ``img_shape``: The shape of the image after the pipeline as a + tuple (H, W). + - ``scale_factor``: The scale factor between the resized image and + the original image. + - ``flip``: A boolean indicating if image flip transform was used. + - ``flip_direction``: The flipping direction. + """ + + def __init__(self, + task_handlers=dict(), + multi_task_fields=('gt_label', ), + meta_keys=('sample_idx', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')): + self.multi_task_fields = multi_task_fields + self.meta_keys = meta_keys + self.task_handlers = defaultdict( + partial(PackClsInputs, meta_keys=meta_keys)) + for task_name, task_handler in task_handlers.items(): + self.task_handlers[task_name] = TRANSFORMS.build( + dict(type=task_handler, meta_keys=meta_keys)) + + def transform(self, results: dict) -> dict: + """Method to pack the input data. 
+ + result = {'img_path': 'a.png', 'gt_label': {'task1': 1, 'task3': 3}, + 'img': array([[[ 0, 0, 0]) + """ + packed_results = dict() + results = results.copy() + + if 'img' in results: + img = results.pop('img') + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + packed_results['inputs'] = to_tensor(img) + + task_results = defaultdict(dict) + for field in self.multi_task_fields: + if field in results: + value = results.pop(field) + for k, v in value.items(): + task_results[k].update({field: v}) + + data_sample = MultiTaskDataSample() + for task_name, task_result in task_results.items(): + task_handler = self.task_handlers[task_name] + task_pack_result = task_handler({**results, **task_result}) + data_sample.set_field(task_pack_result['data_samples'], task_name) + + packed_results['data_samples'] = data_sample + return packed_results + + def __repr__(self): + repr = self.__class__.__name__ + task_handlers = { + name: handler.__class__.__name__ + for name, handler in self.task_handlers.items() + } + repr += f'(task_handlers={task_handlers}, ' + repr += f'multi_task_fields={self.multi_task_fields}, ' + repr += f'meta_keys={self.meta_keys})' + return repr + + @TRANSFORMS.register_module() class Transpose(BaseTransform): """Transpose numpy array. diff --git a/mmcls/evaluation/metrics/__init__.py b/mmcls/evaluation/metrics/__init__.py index 25b4dc27148..78e02a291f8 100644 --- a/mmcls/evaluation/metrics/__init__.py +++ b/mmcls/evaluation/metrics/__init__.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from .multi_label import AveragePrecision, MultiLabelMetric +from .multi_task import MultiTasksMetric from .single_label import Accuracy, SingleLabelMetric from .voc_multi_label import VOCAveragePrecision, VOCMultiLabelMetric __all__ = [ 'Accuracy', 'SingleLabelMetric', 'MultiLabelMetric', 'AveragePrecision', - 'VOCAveragePrecision', 'VOCMultiLabelMetric' + 'MultiTasksMetric', 'VOCAveragePrecision', 'VOCMultiLabelMetric' ] diff --git a/mmcls/evaluation/metrics/multi_task.py b/mmcls/evaluation/metrics/multi_task.py new file mode 100644 index 00000000000..5f07bdd07d5 --- /dev/null +++ b/mmcls/evaluation/metrics/multi_task.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
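A hedged sketch of how the packing transform defined above might sit at the end of a multi-task training pipeline; the task name and the other transforms are illustrative and not part of this patch:

```python
# Sketch of a training pipeline ending with PackMultiTaskInputs.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='RandomResizedCrop', scale=224),
    dict(
        type='PackMultiTaskInputs',
        # Tasks without an explicit handler fall back to PackClsInputs.
        task_handlers=dict(gender='PackClsInputs'),
        multi_task_fields=('gt_label', )),
]
```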
+from typing import Dict, Sequence + +from mmengine.evaluator import BaseMetric + +from mmcls.registry import METRICS + + +@METRICS.register_module() +class MultiTasksMetric(BaseMetric): + """Metrics for MultiTask + Args: + task_metrics(dict): a dictionary in the keys are the names of the tasks + and the values is a list of the metric corresponds to this task + Examples: + >>> import torch + >>> from mmcls.evaluation import MultiTasksMetric + # -------------------- The Basic Usage -------------------- + >>>task_metrics = { + 'task0': [dict(type='Accuracy', topk=(1, ))], + 'task1': [dict(type='Accuracy', topk=(1, 3))] + } + >>>pred = [{ + 'pred_task': { + 'task0': torch.tensor([0.7, 0.0, 0.3]), + 'task1': torch.tensor([0.5, 0.2, 0.3]) + }, + 'gt_task': { + 'task0': torch.tensor(0), + 'task1': torch.tensor(2) + } + }, { + 'pred_task': { + 'task0': torch.tensor([0.0, 0.0, 1.0]), + 'task1': torch.tensor([0.0, 0.0, 1.0]) + }, + 'gt_task': { + 'task0': torch.tensor(2), + 'task1': torch.tensor(2) + } + }] + >>>metric = MultiTasksMetric(task_metrics) + >>>metric.process(None, pred) + >>>results = metric.evaluate(2) + results = { + 'task0_accuracy/top1': 100.0, + 'task1_accuracy/top1': 50.0, + 'task1_accuracy/top3': 100.0 + } + """ + + def __init__(self, + task_metrics: Dict, + collect_device: str = 'cpu') -> None: + self.task_metrics = task_metrics + super().__init__(collect_device=collect_device) + + self._metrics = {} + for task_name in self.task_metrics.keys(): + self._metrics[task_name] = [] + for metric in self.task_metrics[task_name]: + self._metrics[task_name].append(METRICS.build(metric)) + + def process(self, data_batch, data_samples: Sequence[dict]): + """Process one batch of data samples. + + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. + Args: + data_batch: A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for task_name in self.task_metrics.keys(): + filtered_data_samples = [] + for data_sample in data_samples: + eval_mask = data_sample[task_name]['eval_mask'] + if eval_mask: + filtered_data_samples.append(data_sample[task_name]) + for metric in self._metrics[task_name]: + metric.process(data_batch, filtered_data_samples) + + def compute_metrics(self, results: list) -> dict: + raise NotImplementedError( + 'compute metrics should not be used here directly') + + def evaluate(self, size): + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. When batch + size > 1, the dataloader may pad some data samples to make + sure all ranks have the same length of dataset slice. The + ``collect_results`` function will drop the padded data based on + this size. + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are + "{task_name}_{metric_name}" , and the values + are corresponding results. 
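In a config file, the metric above would typically be set as the evaluator. A hedged sketch mirroring the per-task metrics used in the unit tests added later in this patch; the task names are illustrative:

```python
# Sketch: one Accuracy metric for 'task0', two metrics for 'task1'.
val_evaluator = dict(
    type='MultiTasksMetric',
    task_metrics={
        'task0': [dict(type='Accuracy', topk=(1, ))],
        'task1': [
            dict(type='Accuracy', topk=(1, 3)),
            dict(type='SingleLabelMetric', items=['precision', 'recall']),
        ],
    })
# Reported keys are prefixed per task, e.g. 'task0_accuracy/top1'
# and 'task1_single-label/precision'.
```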
+ """ + metrics = {} + for task_name in self._metrics: + for metric in self._metrics[task_name]: + name = metric.__class__.__name__ + if name == 'MultiTasksMetric' or metric.results: + results = metric.evaluate(size) + else: + results = {metric.__class__.__name__: 0} + for key in results: + name = f'{task_name}_{key}' + if name in results: + """Inspired from https://github.com/open- + mmlab/mmengine/ bl ob/ed20a9cba52ceb371f7c825131636b9e2 + 747172e/mmengine/evalua tor/evaluator.py#L84-L87.""" + raise ValueError( + 'There are multiple metric results with the same' + f'metric name {name}. Please make sure all metrics' + 'have different prefixes.') + metrics[name] = results[key] + return metrics diff --git a/mmcls/models/heads/__init__.py b/mmcls/models/heads/__init__.py index 3e359d37227..b5f5761ef46 100644 --- a/mmcls/models/heads/__init__.py +++ b/mmcls/models/heads/__init__.py @@ -8,11 +8,13 @@ from .multi_label_cls_head import MultiLabelClsHead from .multi_label_csra_head import CSRAClsHead from .multi_label_linear_head import MultiLabelLinearClsHead +from .multi_task_head import MultiTaskHead from .stacked_head import StackedLinearClsHead from .vision_transformer_head import VisionTransformerClsHead __all__ = [ 'ClsHead', 'LinearClsHead', 'StackedLinearClsHead', 'MultiLabelClsHead', 'MultiLabelLinearClsHead', 'VisionTransformerClsHead', 'DeiTClsHead', - 'ConformerHead', 'EfficientFormerClsHead', 'ArcFaceClsHead', 'CSRAClsHead' + 'ConformerHead', 'EfficientFormerClsHead', 'ArcFaceClsHead', 'CSRAClsHead', + 'MultiTaskHead' ] diff --git a/mmcls/models/heads/cls_head.py b/mmcls/models/heads/cls_head.py index 4af22f65764..1338947bf53 100644 --- a/mmcls/models/heads/cls_head.py +++ b/mmcls/models/heads/cls_head.py @@ -108,9 +108,10 @@ def _get_loss(self, cls_score: torch.Tensor, return losses def predict( - self, - feats: Tuple[torch.Tensor], - data_samples: List[ClsDataSample] = None) -> List[ClsDataSample]: + self, + feats: Tuple[torch.Tensor], + data_samples: List[Union[ClsDataSample, None]] = None + ) -> List[ClsDataSample]: """Inference without augmentation. Args: @@ -118,7 +119,7 @@ def predict( Multiple stage inputs are acceptable but only the last stage will be used to classify. The shape of every item should be ``(num_samples, num_classes)``. - data_samples (List[ClsDataSample], optional): The annotation + data_samples (List[ClsDataSample | None], optional): The annotation data of every samples. If not None, set ``pred_label`` of the input data samples. Defaults to None. 
@@ -141,14 +142,15 @@ def _get_predictions(self, cls_score, data_samples): pred_scores = F.softmax(cls_score, dim=1) pred_labels = pred_scores.argmax(dim=1, keepdim=True).detach() - if data_samples is not None: - for data_sample, score, label in zip(data_samples, pred_scores, - pred_labels): - data_sample.set_pred_score(score).set_pred_label(label) - else: - data_samples = [] - for score, label in zip(pred_scores, pred_labels): - data_samples.append(ClsDataSample().set_pred_score( - score).set_pred_label(label)) + out_data_samples = [] + if data_samples is None: + data_samples = [None for _ in range(pred_scores.size(0))] + + for data_sample, score, label in zip(data_samples, pred_scores, + pred_labels): + if data_sample is None: + data_sample = ClsDataSample() - return data_samples + data_sample.set_pred_score(score).set_pred_label(label) + out_data_samples.append(data_sample) + return out_data_samples diff --git a/mmcls/models/heads/multi_task_head.py b/mmcls/models/heads/multi_task_head.py new file mode 100644 index 00000000000..64167739f65 --- /dev/null +++ b/mmcls/models/heads/multi_task_head.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple + +import torch +import torch.nn as nn +from mmengine.model import ModuleDict + +from mmcls.registry import MODELS +from mmcls.structures import MultiTaskDataSample +from .base_head import BaseHead + + +def loss_convertor(loss_func, task_name): + + def wrapped(inputs, data_samples, **kwargs): + mask = torch.empty(len(data_samples), dtype=torch.bool) + task_data_samples = [] + for i, data_sample in enumerate(data_samples): + assert isinstance(data_sample, MultiTaskDataSample) + sample_mask = task_name in data_sample + mask[i] = sample_mask + if sample_mask: + task_data_samples.append(data_sample.get(task_name)) + + if len(task_data_samples) == 0: + return {'loss': torch.tensor(0.), 'mask_size': torch.tensor(0.)} + + # Mask the inputs of the task + def mask_inputs(inputs, mask): + if isinstance(inputs, Sequence): + return type(inputs)( + [mask_inputs(input, mask) for input in inputs]) + elif isinstance(inputs, torch.Tensor): + return inputs[mask] + + masked_inputs = mask_inputs(inputs, mask) + loss_output = loss_func(masked_inputs, task_data_samples, **kwargs) + loss_output['mask_size'] = mask.sum().to(torch.float) + return loss_output + + return wrapped + + +@MODELS.register_module() +class MultiTaskHead(BaseHead): + """Multi task head. + + Args: + task_heads (dict): Sub heads to use, the key will be use to rename the + loss components. + common_cfg (dict): The common settings for all heads. Defaults to an + empty dict. + init_cfg (dict, optional): The extra initialization settings. + Defaults to None. + """ + + def __init__(self, task_heads, init_cfg=None, **kwargs): + super(MultiTaskHead, self).__init__(init_cfg=init_cfg) + + assert isinstance(task_heads, dict), 'The `task_heads` argument' \ + "should be a dict, which's keys are task names and values are" \ + 'configs of head for the task.' 
+ + self.task_heads = ModuleDict() + + for task_name, sub_head in task_heads.items(): + if not isinstance(sub_head, nn.Module): + sub_head = MODELS.build(sub_head, default_args=kwargs) + sub_head.loss = loss_convertor(sub_head.loss, task_name) + self.task_heads[task_name] = sub_head + + def forward(self, feats): + """The forward process.""" + return { + task_name: head(feats) + for task_name, head in self.task_heads.items() + } + + def loss(self, feats: Tuple[torch.Tensor], + data_samples: List[MultiTaskDataSample], **kwargs) -> dict: + """Calculate losses from the classification score. + + Args: + feats (tuple[Tensor]): The features extracted from the backbone. + data_samples (List[MultiTaskDataSample]): The annotation data of + every samples. + **kwargs: Other keyword arguments to forward the loss module. + + Returns: + dict[str, Tensor]: a dictionary of loss components, each task loss + key will be prefixed by the task_name like "task1_loss" + """ + losses = dict() + for task_name, head in self.task_heads.items(): + head_loss = head.loss(feats, data_samples, **kwargs) + for k, v in head_loss.items(): + losses[f'{task_name}_{k}'] = v + return losses + + def predict( + self, + feats: Tuple[torch.Tensor], + data_samples: List[MultiTaskDataSample] = None + ) -> List[MultiTaskDataSample]: + """Inference without augmentation. + + Args: + feats (tuple[Tensor]): The features extracted from the backbone. + data_samples (List[MultiTaskDataSample], optional): The annotation + data of every samples. If not None, set ``pred_label`` of + the input data samples. Defaults to None. + + Returns: + List[MultiTaskDataSample]: A list of data samples which contains + the predicted results. + """ + predictions_dict = dict() + + for task_name, head in self.task_heads.items(): + task_samples = head.predict(feats) + batch_size = len(task_samples) + predictions_dict[task_name] = task_samples + + if data_samples is None: + data_samples = [MultiTaskDataSample() for _ in range(batch_size)] + + for task_name, task_samples in predictions_dict.items(): + for data_sample, task_sample in zip(data_samples, task_samples): + task_sample.set_field( + task_name in data_sample.tasks, + 'eval_mask', + field_type='metainfo') + + if task_name in data_sample.tasks: + data_sample.get(task_name).update(task_sample) + else: + data_sample.set_field(task_sample, task_name) + + return data_samples diff --git a/mmcls/models/utils/data_preprocessor.py b/mmcls/models/utils/data_preprocessor.py index 1da730c2f35..716b0a1eafa 100644 --- a/mmcls/models/utils/data_preprocessor.py +++ b/mmcls/models/utils/data_preprocessor.py @@ -8,7 +8,8 @@ from mmengine.model import BaseDataPreprocessor, stack_batch from mmcls.registry import MODELS -from mmcls.structures import (batch_label_to_onehot, cat_batch_labels, +from mmcls.structures import (ClsDataSample, MultiTaskDataSample, + batch_label_to_onehot, cat_batch_labels, stack_batch_scores, tensor_split) from .batch_augments import RandomBatchAugment @@ -151,7 +152,9 @@ def forward(self, data: dict, training: bool = False) -> dict: self.pad_value) data_samples = data.get('data_samples', None) - if data_samples is not None and 'gt_label' in data_samples[0]: + sample_item = data_samples[0] if data_samples is not None else None + if isinstance(sample_item, + ClsDataSample) and 'gt_label' in sample_item: gt_labels = [sample.gt_label for sample in data_samples] batch_label, label_indices = cat_batch_labels( gt_labels, device=self.device) @@ -181,5 +184,7 @@ def forward(self, data: dict, training: bool = 
False) -> dict: if batch_score is not None: for sample, score in zip(data_samples, batch_score): sample.set_gt_score(score) + elif isinstance(sample_item, MultiTaskDataSample): + data_samples = self.cast_data(data_samples) return {'inputs': inputs, 'data_samples': data_samples} diff --git a/mmcls/structures/__init__.py b/mmcls/structures/__init__.py index 0dc08443cab..3021d0a7d0b 100644 --- a/mmcls/structures/__init__.py +++ b/mmcls/structures/__init__.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from .cls_data_sample import ClsDataSample +from .multi_task_data_sample import MultiTaskDataSample from .utils import (batch_label_to_onehot, cat_batch_labels, stack_batch_scores, tensor_split) __all__ = [ 'ClsDataSample', 'batch_label_to_onehot', 'cat_batch_labels', - 'stack_batch_scores', 'tensor_split' + 'stack_batch_scores', 'tensor_split', 'MultiTaskDataSample' ] diff --git a/mmcls/structures/multi_task_data_sample.py b/mmcls/structures/multi_task_data_sample.py new file mode 100644 index 00000000000..f00993861bf --- /dev/null +++ b/mmcls/structures/multi_task_data_sample.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmengine.structures import BaseDataElement + + +class MultiTaskDataSample(BaseDataElement): + + @property + def tasks(self): + return self._data_fields diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000000..ef101fec61e --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/tests/data/dataset/multi-task.json b/tests/data/dataset/multi-task.json new file mode 100644 index 00000000000..bf96384a0e6 --- /dev/null +++ b/tests/data/dataset/multi-task.json @@ -0,0 +1,40 @@ +{ + "metainfo": { + "tasks": [ + "gender", + "wear" + ] + }, + "data_list": [ + { + "img_path": "a/1.JPG", + "gt_label": { + "gender": 0 + } + }, + { + "img_path": "b/2.jpeg", + "gt_label": { + "gender": 0, + "wear": [ + 1, + 0, + 1, + 0 + ] + } + }, + { + "img_path": "b/subb/3.jpg", + "gt_label": { + "gender": 1, + "wear": [ + 0, + 1, + 0, + 1 + ] + } + } + ] +} diff --git a/tests/test_datasets/test_datasets.py b/tests/test_datasets/test_datasets.py index f637fb9580d..eb2fab213e5 100644 --- a/tests/test_datasets/test_datasets.py +++ b/tests/test_datasets/test_datasets.py @@ -79,7 +79,7 @@ def test_repr(self): else: self.assertIn('The `CLASSES` meta info is not set.', repr(dataset)) - self.assertIn("Haven't been initialized", repr(dataset)) + self.assertIn('Haven\'t been initialized', repr(dataset)) dataset.full_init() self.assertIn(f'Number of samples: \t{len(dataset)}', repr(dataset)) @@ -452,7 +452,7 @@ def test_extra_repr(self): cfg = {**self.DEFAULT_ARGS, 'lazy_init': True} dataset = dataset_class(**cfg) - self.assertIn(f'Prefix of data: \t{dataset.data_prefix["root"]}', + self.assertIn(f"Prefix of data: \t{dataset.data_prefix['root']}", repr(dataset)) @classmethod @@ -597,7 +597,7 @@ def test_load_data_list(self): } dataset = dataset_class(**cfg) - self.assertIn("Haven't been initialized", repr(dataset)) + self.assertIn('Haven\'t been initialized', repr(dataset)) dataset.full_init() self.assertIn(f'Number of samples: \t{len(dataset)}', repr(dataset)) @@ -770,7 +770,7 @@ def test_extra_repr(self): cfg = {**self.DEFAULT_ARGS, 'lazy_init': True} dataset = dataset_class(**cfg) - self.assertIn(f'Prefix of data: \t{dataset.data_prefix["root"]}', + self.assertIn(f"Prefix of data: \t{dataset.data_prefix['root']}", repr(dataset)) @classmethod @@ -874,3 +874,70 @@ def 
test_extra_repr(self): @classmethod def tearDownClass(cls): cls.tmpdir.cleanup() + + +class TestMultiTaskDataset(TestCase): + DATASET_TYPE = 'MultiTaskDataset' + + DEFAULT_ARGS = dict( + data_root=ASSETS_ROOT, + ann_file=osp.join(ASSETS_ROOT, 'multi-task.json'), + pipeline=[]) + + def test_metainfo(self): + dataset_class = DATASETS.get(self.DATASET_TYPE) + + # Test default behavior + dataset = dataset_class(**self.DEFAULT_ARGS) + metainfo = {'tasks': ['gender', 'wear']} + self.assertDictEqual(dataset.metainfo, metainfo) + self.assertFalse(dataset.test_mode) + + def test_parse_data_info(self): + dataset_class = DATASETS.get(self.DATASET_TYPE) + dataset = dataset_class(**self.DEFAULT_ARGS) + + data = dataset.parse_data_info({ + 'img_path': 'a.jpg', + 'gt_label': { + 'gender': 0 + } + }) + self.assertDictContainsSubset( + { + 'img_path': os.path.join(ASSETS_ROOT, 'a.jpg'), + 'gt_label': { + 'gender': 0 + } + }, data) + np.testing.assert_equal(data['gt_label']['gender'], 0) + + # Test missing path + with self.assertRaisesRegex(AssertionError, 'have `img_path` field'): + dataset.parse_data_info( + {'gt_label': { + 'gender': 0, + 'wear': [1, 0, 1, 0] + }}) + + def test_repr(self): + dataset_class = DATASETS.get(self.DATASET_TYPE) + dataset = dataset_class(**self.DEFAULT_ARGS) + + task_doc = ('For 2 tasks\n gender \n wear ') + self.assertIn(task_doc, repr(dataset)) + + def test_load_data_list(self): + dataset_class = DATASETS.get(self.DATASET_TYPE) + + # Test default behavior + dataset = dataset_class(**self.DEFAULT_ARGS) + + data = dataset.load_data_list(self.DEFAULT_ARGS['ann_file']) + self.assertIsInstance(data, list) + np.testing.assert_equal(len(data), 3) + np.testing.assert_equal(data[0]['gt_label'], {'gender': 0}) + np.testing.assert_equal(data[1]['gt_label'], { + 'gender': 0, + 'wear': [1, 0, 1, 0] + }) diff --git a/tests/test_datasets/test_transforms/test_formatting.py b/tests/test_datasets/test_transforms/test_formatting.py index 0d271b3b0c6..6806b0b8060 100644 --- a/tests/test_datasets/test_transforms/test_formatting.py +++ b/tests/test_datasets/test_transforms/test_formatting.py @@ -10,7 +10,7 @@ from PIL import Image from mmcls.registry import TRANSFORMS -from mmcls.structures import ClsDataSample +from mmcls.structures import ClsDataSample, MultiTaskDataSample from mmcls.utils import register_all_modules register_all_modules() @@ -51,9 +51,8 @@ def test_transform(self): # Test without `img` and `gt_label` del data['img'] del data['gt_label'] - with self.assertWarnsRegex(Warning, 'Cannot get "img"'): - results = transform(copy.deepcopy(data)) - self.assertNotIn('gt_label', results['data_samples']) + results = transform(copy.deepcopy(data)) + self.assertNotIn('gt_label', results['data_samples']) def test_repr(self): cfg = dict(type='PackClsInputs', meta_keys=['flip', 'img_shape']) @@ -130,3 +129,54 @@ def test_repr(self): cfg = dict(type='Collect', keys=['img']) transform = TRANSFORMS.build(cfg) self.assertEqual(repr(transform), "Collect(keys=['img'])") + + +class TestPackMultiTaskInputs(unittest.TestCase): + + def test_transform(self): + img_path = osp.join(osp.dirname(__file__), '../../data/color.jpg') + data = { + 'sample_idx': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (300, 400), + 'scale_factor': 1.0, + 'flip': False, + 'img': mmcv.imread(img_path), + 'gt_label': { + 'task1': 1, + 'task3': 3 + }, + } + + cfg = dict(type='PackMultiTaskInputs', ) + transform = TRANSFORMS.build(cfg) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', 
results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], MultiTaskDataSample) + self.assertIn('flip', results['data_samples'].task1.metainfo_keys()) + self.assertIsInstance(results['data_samples'].task1.gt_label, + LabelData) + + # Test grayscale image + data['img'] = data['img'].mean(-1) + results = transform(copy.deepcopy(data)) + self.assertIn('inputs', results) + self.assertIsInstance(results['inputs'], torch.Tensor) + self.assertEqual(results['inputs'].shape, (1, 300, 400)) + + # Test without `img` and `gt_label` + del data['img'] + del data['gt_label'] + results = transform(copy.deepcopy(data)) + self.assertNotIn('gt_label', results['data_samples']) + + def test_repr(self): + cfg = dict(type='PackMultiTaskInputs', meta_keys=['img_shape']) + transform = TRANSFORMS.build(cfg) + rep = 'PackMultiTaskInputs(task_handlers={},' + rep += ' multi_task_fields=(\'gt_label\',),' + rep += ' meta_keys=[\'img_shape\'])' + self.assertEqual(repr(transform), rep) diff --git a/tests/test_evaluation/test_metrics/test_multi_task_metrics.py b/tests/test_evaluation/test_metrics/test_multi_task_metrics.py new file mode 100644 index 00000000000..29e4d96d414 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_multi_task_metrics.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmcls.evaluation.metrics import MultiTasksMetric +from mmcls.structures import ClsDataSample + + +class MultiTaskMetric(TestCase): + data_pred = [ + { + 'task0': torch.tensor([0.7, 0.0, 0.3]), + 'task1': torch.tensor([0.5, 0.2, 0.3]) + }, + { + 'task0': torch.tensor([0.0, 0.0, 1.0]), + 'task1': torch.tensor([0.0, 0.0, 1.0]) + }, + ] + data_gt = [{'task0': 0, 'task1': 2}, {'task1': 2}] + + preds = [] + for i, pred in enumerate(data_pred): + sample = {} + for task_name in pred: + task_sample = ClsDataSample().set_pred_score(pred[task_name]) + if task_name in data_gt[i]: + task_sample.set_gt_label(data_gt[i][task_name]) + task_sample.set_field(True, 'eval_mask', field_type='metainfo') + else: + task_sample.set_field( + False, 'eval_mask', field_type='metainfo') + sample[task_name] = task_sample.to_dict() + + preds.append(sample) + data2 = zip([ + { + 'task0': torch.tensor([0.7, 0.0, 0.3]), + 'task1': { + 'task10': torch.tensor([0.5, 0.2, 0.3]), + 'task11': torch.tensor([0.4, 0.3, 0.3]) + } + }, + { + 'task0': torch.tensor([0.0, 0.0, 1.0]), + 'task1': { + 'task10': torch.tensor([0.1, 0.6, 0.3]), + 'task11': torch.tensor([0.5, 0.2, 0.3]) + } + }, + ], [{ + 'task0': 0, + 'task1': { + 'task10': 2, + 'task11': 0 + } + }, { + 'task0': 2, + 'task1': { + 'task10': 1, + 'task11': 0 + } + }]) + + pred2 = [] + for score, label in data2: + sample = {} + for task_name in score: + if type(score[task_name]) != dict: + task_sample = ClsDataSample().set_pred_score(score[task_name]) + task_sample.set_gt_label(label[task_name]) + sample[task_name] = task_sample.to_dict() + sample[task_name]['eval_mask'] = True + else: + sample[task_name] = {} + sample[task_name]['eval_mask'] = True + for task_name2 in score[task_name]: + task_sample = ClsDataSample().set_pred_score( + score[task_name][task_name2]) + task_sample.set_gt_label(label[task_name][task_name2]) + sample[task_name][task_name2] = task_sample.to_dict() + sample[task_name][task_name2]['eval_mask'] = True + + pred2.append(sample) + + pred3 = [{'task0': {'eval_mask': False}, 'task1': {'eval_mask': False}}] + task_metrics = { + 
'task0': [dict(type='Accuracy', topk=(1, ))], + 'task1': [ + dict(type='Accuracy', topk=(1, 3)), + dict(type='SingleLabelMetric', items=['precision', 'recall']) + ] + } + task_metrics2 = { + 'task0': [dict(type='Accuracy', topk=(1, ))], + 'task1': [ + dict( + type='MultiTasksMetric', + task_metrics={ + 'task10': [ + dict(type='Accuracy', topk=(1, 3)), + dict(type='SingleLabelMetric', items=['precision']) + ], + 'task11': [dict(type='Accuracy', topk=(1, ))] + }) + ] + } + + def test_evaluate(self): + """Test using the metric in the same way as Evalutor.""" + + # Test with score (use score instead of label if score exists) + metric = MultiTasksMetric(self.task_metrics) + metric.process(None, self.preds) + results = metric.evaluate(2) + self.assertIsInstance(results, dict) + self.assertAlmostEqual(results['task0_accuracy/top1'], 100) + self.assertGreater(results['task1_single-label/precision'], 0) + + # Test nested + metric = MultiTasksMetric(self.task_metrics2) + metric.process(None, self.pred2) + results = metric.evaluate(2) + self.assertIsInstance(results, dict) + self.assertGreater(results['task1_task10_single-label/precision'], 0) + self.assertGreater(results['task1_task11_accuracy/top1'], 0) + + # Test with without any ground truth value + metric = MultiTasksMetric(self.task_metrics) + metric.process(None, self.pred3) + results = metric.evaluate(2) + self.assertIsInstance(results, dict) + self.assertEqual(results['task0_Accuracy'], 0) diff --git a/tests/test_models/test_heads.py b/tests/test_models/test_heads.py index 0b1f72f1db8..85fdd7aa1fa 100644 --- a/tests/test_models/test_heads.py +++ b/tests/test_models/test_heads.py @@ -10,7 +10,7 @@ from mmengine import is_seq_of from mmcls.registry import MODELS -from mmcls.structures import ClsDataSample +from mmcls.structures import ClsDataSample, MultiTaskDataSample from mmcls.utils import register_all_modules register_all_modules() @@ -484,6 +484,142 @@ def test_forward(self): head(feats) +class TestMultiTaskHead(TestCase): + DEFAULT_ARGS = dict( + type='MultiTaskHead', # <- Head config, depends on #675 + task_heads={ + 'task0': dict(type='LinearClsHead', num_classes=3), + 'task1': dict(type='LinearClsHead', num_classes=6), + }, + in_channels=10, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + ) + + DEFAULT_ARGS2 = dict( + type='MultiTaskHead', # <- Head config, depends on #675 + task_heads={ + 'task0': + dict( + type='MultiTaskHead', + task_heads={ + 'task00': dict(type='LinearClsHead', num_classes=3), + 'task01': dict(type='LinearClsHead', num_classes=6), + }), + 'task1': + dict(type='LinearClsHead', num_classes=6) + }, + in_channels=10, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + ) + + def test_forward(self): + head = MODELS.build(self.DEFAULT_ARGS) + # return the last item (same as pre_logits) + feats = (torch.rand(4, 10), ) + outs = head(feats) + self.assertEqual(outs['task0'].shape, (4, 3)) + self.assertEqual(outs['task1'].shape, (4, 6)) + self.assertTrue(isinstance(outs, dict)) + + def test_loss(self): + feats = (torch.rand(4, 10), ) + data_samples = [] + + for _ in range(4): + data_sample = MultiTaskDataSample() + for task_name in self.DEFAULT_ARGS['task_heads']: + task_sample = ClsDataSample().set_gt_label(1) + data_sample.set_field(task_sample, task_name) + data_samples.append(data_sample) + # with cal_acc = False + head = MODELS.build(self.DEFAULT_ARGS) + + losses = head.loss(feats, data_samples) + self.assertEqual( + losses.keys(), + {'task0_loss', 'task0_mask_size', 'task1_loss', 'task1_mask_size'}) + 
self.assertGreater(losses['task0_loss'].item(), 0) + self.assertGreater(losses['task1_loss'].item(), 0) + + def test_predict(self): + feats = (torch.rand(4, 10), ) + data_samples = [] + + for _ in range(4): + data_sample = MultiTaskDataSample() + for task_name in self.DEFAULT_ARGS['task_heads']: + task_sample = ClsDataSample().set_gt_label(1) + data_sample.set_field(task_sample, task_name) + data_samples.append(data_sample) + head = MODELS.build(self.DEFAULT_ARGS) + # with without data_samples + predictions = head.predict(feats) + self.assertTrue(is_seq_of(predictions, MultiTaskDataSample)) + for pred in predictions: + self.assertIn('task0', pred) + task0_sample = predictions[0].task0 + self.assertTrue(type(task0_sample.pred_label.score), 'torch.tensor') + + # with with data_samples + predictions = head.predict(feats, data_samples) + self.assertTrue(is_seq_of(predictions, MultiTaskDataSample)) + for sample, pred in zip(data_samples, predictions): + self.assertIs(sample, pred) + self.assertIn('task0', pred) + + def test_loss_empty_data_sample(self): + feats = (torch.rand(4, 10), ) + data_samples = [] + + for _ in range(4): + data_sample = MultiTaskDataSample() + data_samples.append(data_sample) + # with cal_acc = False + head = MODELS.build(self.DEFAULT_ARGS) + losses = head.loss(feats, data_samples) + self.assertEqual( + losses.keys(), + {'task0_loss', 'task0_mask_size', 'task1_loss', 'task1_mask_size'}) + self.assertEqual(losses['task0_loss'].item(), 0) + self.assertEqual(losses['task1_loss'].item(), 0) + + def test_nested_multi_task_loss(self): + + head = MODELS.build(self.DEFAULT_ARGS2) + # return the last item (same as pre_logits) + feats = (torch.rand(4, 10), ) + outs = head(feats) + self.assertEqual(outs['task0']['task01'].shape, (4, 6)) + self.assertTrue(isinstance(outs, dict)) + self.assertTrue(isinstance(outs['task0'], dict)) + + def test_nested_invalid_sample(self): + feats = (torch.rand(4, 10), ) + gt_label = {'task0': 1, 'task1': 1} + head = MODELS.build(self.DEFAULT_ARGS2) + data_sample = MultiTaskDataSample() + for task_name in gt_label: + task_sample = ClsDataSample().set_gt_label(gt_label[task_name]) + data_sample.set_field(task_sample, task_name) + with self.assertRaises(Exception): + head.loss(feats, data_sample) + + def test_nested_invalid_sample2(self): + feats = (torch.rand(4, 10), ) + gt_label = {'task0': {'task00': 1, 'task01': 1}, 'task1': 1} + head = MODELS.build(self.DEFAULT_ARGS) + data_sample = MultiTaskDataSample() + task_sample = ClsDataSample().set_gt_label(gt_label['task1']) + data_sample.set_field(task_sample, 'task1') + data_sample.set_field(MultiTaskDataSample(), 'task0') + for task_name in gt_label['task0']: + task_sample = ClsDataSample().set_gt_label( + gt_label['task0'][task_name]) + data_sample.task0.set_field(task_sample, task_name) + with self.assertRaises(Exception): + head.loss(feats, data_sample) + + class TestArcFaceClsHead(TestCase): DEFAULT_ARGS = dict(type='ArcFaceClsHead', in_channels=10, num_classes=5) diff --git a/tests/test_structures/test_datasample.py b/tests/test_structures/test_datasample.py index ee45c3f24a3..e02c95fc787 100644 --- a/tests/test_structures/test_datasample.py +++ b/tests/test_structures/test_datasample.py @@ -5,7 +5,7 @@ import torch from mmengine.structures import LabelData -from mmcls.structures import ClsDataSample +from mmcls.structures import ClsDataSample, MultiTaskDataSample class TestClsDataSample(TestCase): @@ -122,3 +122,20 @@ def test_set_pred_score(self): with self.assertRaisesRegex(AssertionError, 'but 
got 2'): data_sample.set_pred_score( torch.tensor([[0.1, 0.1, 0.6, 0.1, 0.1]])) + + +class TestMultiTaskDataSample(TestCase): + + def test_multi_task_data_sample(self): + gt_label = {'task0': {'task00': 1, 'task01': 1}, 'task1': 1} + data_sample = MultiTaskDataSample() + task_sample = ClsDataSample().set_gt_label(gt_label['task1']) + data_sample.set_field(task_sample, 'task1') + data_sample.set_field(MultiTaskDataSample(), 'task0') + for task_name in gt_label['task0']: + task_sample = ClsDataSample().set_gt_label( + gt_label['task0'][task_name]) + data_sample.task0.set_field(task_sample, task_name) + self.assertIsInstance(data_sample.task0, MultiTaskDataSample) + self.assertIsInstance(data_sample.task1, ClsDataSample) + self.assertIsInstance(data_sample.task0.task00, ClsDataSample) From 9038c1c25516c20f144d4cfa267890d9562267f2 Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Fri, 30 Dec 2022 11:46:17 +0800 Subject: [PATCH 16/21] [Feature] Support TTA and add `--tta` in `tools/test.py`. (#1161) * [Feature] Support TTA and add `--tta` in `tools/test.py`. * Add unit tests. * Rename the TTA model to `AverageClsScoreTTA`. --- mmcls/models/__init__.py | 1 + mmcls/models/tta/__init__.py | 4 +++ mmcls/models/tta/score_tta.py | 36 +++++++++++++++++++ tests/test_models/test_tta.py | 67 +++++++++++++++++++++++++++++++++++ tools/test.py | 29 ++++++++++++++- 5 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 mmcls/models/tta/__init__.py create mode 100644 mmcls/models/tta/score_tta.py create mode 100644 tests/test_models/test_tta.py diff --git a/mmcls/models/__init__.py b/mmcls/models/__init__.py index b7e984d32b5..b3ba9232020 100644 --- a/mmcls/models/__init__.py +++ b/mmcls/models/__init__.py @@ -8,6 +8,7 @@ from .losses import * # noqa: F401,F403 from .necks import * # noqa: F401,F403 from .retrievers import * # noqa: F401,F403 +from .tta import * # noqa: F401,F403 from .utils import * # noqa: F401,F403 __all__ = [ diff --git a/mmcls/models/tta/__init__.py b/mmcls/models/tta/__init__.py new file mode 100644 index 00000000000..568e64ffdc7 --- /dev/null +++ b/mmcls/models/tta/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .score_tta import AverageClsScoreTTA + +__all__ = ['AverageClsScoreTTA'] diff --git a/mmcls/models/tta/score_tta.py b/mmcls/models/tta/score_tta.py new file mode 100644 index 00000000000..cb348c555eb --- /dev/null +++ b/mmcls/models/tta/score_tta.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +from mmengine.model import BaseTTAModel + +from mmcls.registry import MODELS +from mmcls.structures import ClsDataSample + + +@MODELS.register_module() +class AverageClsScoreTTA(BaseTTAModel): + + def merge_preds( + self, + data_samples_list: List[List[ClsDataSample]], + ) -> List[ClsDataSample]: + """Merge predictions of enhanced data to one prediction. + + Args: + data_samples_list (List[List[ClsDataSample]]): List of predictions + of all enhanced data. + + Returns: + List[ClsDataSample]: Merged prediction. 
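+
+        For example, if one image yields two augmented predictions whose
+        scores are ``[0.2, 0.8]`` and ``[0.4, 0.6]``, the merged data
+        sample carries their element-wise mean ``[0.3, 0.7]``.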
+ """ + merged_data_samples = [] + for data_samples in data_samples_list: + merged_data_samples.append(self._merge_single_sample(data_samples)) + return merged_data_samples + + def _merge_single_sample(self, data_samples): + merged_data_sample: ClsDataSample = data_samples[0].new() + merged_score = sum(data_sample.pred_label.score + for data_sample in data_samples) / len(data_samples) + merged_data_sample.set_pred_score(merged_score) + return merged_data_sample diff --git a/tests/test_models/test_tta.py b/tests/test_models/test_tta.py new file mode 100644 index 00000000000..2affb190a30 --- /dev/null +++ b/tests/test_models/test_tta.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from unittest import TestCase + +import torch +from mmengine import ConfigDict + +from mmcls.models import AverageClsScoreTTA, ImageClassifier +from mmcls.registry import MODELS +from mmcls.structures import ClsDataSample +from mmcls.utils import register_all_modules + +register_all_modules() + + +class TestAverageClsScoreTTA(TestCase): + DEFAULT_ARGS = dict( + type='AverageClsScoreTTA', + module=dict( + type='ImageClassifier', + backbone=dict(type='ResNet', depth=18), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=10, + in_channels=512, + loss=dict(type='CrossEntropyLoss')))) + + def test_initialize(self): + model: AverageClsScoreTTA = MODELS.build(self.DEFAULT_ARGS) + self.assertIsInstance(model.module, ImageClassifier) + + def test_forward(self): + inputs = torch.rand(1, 3, 224, 224) + model: AverageClsScoreTTA = MODELS.build(self.DEFAULT_ARGS) + + # The forward of TTA model should not be called. + with self.assertRaisesRegex(NotImplementedError, 'will not be called'): + model(inputs) + + def test_test_step(self): + cfg = ConfigDict(deepcopy(self.DEFAULT_ARGS)) + cfg.module.data_preprocessor = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]) + model: AverageClsScoreTTA = MODELS.build(cfg) + + img1 = torch.randint(0, 256, (1, 3, 224, 224)) + img2 = torch.randint(0, 256, (1, 3, 224, 224)) + data1 = { + 'inputs': img1, + 'data_samples': [ClsDataSample().set_gt_label(1)] + } + data2 = { + 'inputs': img2, + 'data_samples': [ClsDataSample().set_gt_label(1)] + } + data_tta = { + 'inputs': [img1, img2], + 'data_samples': [[ClsDataSample().set_gt_label(1)], + [ClsDataSample().set_gt_label(1)]] + } + + score1 = model.module.test_step(data1)[0].pred_label.score + score2 = model.module.test_step(data2)[0].pred_label.score + score_tta = model.test_step(data_tta)[0].pred_label.score + + torch.testing.assert_allclose(score_tta, (score1 + score2) / 2) diff --git a/tools/test.py b/tools/test.py index 0fd56510453..61004d8c187 100644 --- a/tools/test.py +++ b/tools/test.py @@ -56,6 +56,13 @@ def parse_args(): '--no-pin-memory', action='store_true', help='whether to disable the pin_memory option in dataloaders.') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to enable the Test-Time-Aug (TTA). If the config file ' + 'has `tta_pipeline` and `tta_model` fields, use them to determine the ' + 'TTA transforms and how to merge the TTA results. 
Otherwise, use flip ' + 'TTA by averaging classification score.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], @@ -105,7 +112,27 @@ def merge_args(cfg, args): else: cfg.test_evaluator = [cfg.test_evaluator, dump_metric] - # set dataloader args + # -------------------- TTA related args -------------------- + if args.tta: + if 'tta_model' not in cfg: + cfg.tta_model = dict(type='mmcls.AverageClsScoreTTA') + if 'tta_pipeline' not in cfg: + test_pipeline = cfg.test_dataloader.dataset.pipeline + cfg.tta_pipeline = deepcopy(test_pipeline) + flip_tta = dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) + ], + [test_pipeline[-1]], + ]) + cfg.tta_pipeline[-1] = flip_tta + cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model) + cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline + + # ----------------- Default dataloader args ----------------- default_dataloader_cfg = ConfigDict( pin_memory=True, collate_fn=dict(type='default_collate'), From 74743ef588f286907937607f45cfa67a4eb7b47f Mon Sep 17 00:00:00 2001 From: QINGTIAN <102738505+suibe-qingtian@users.noreply.github.com> Date: Fri, 30 Dec 2022 15:18:39 +0800 Subject: [PATCH 17/21] [Feature] [CodeCamp #68] Add EfficientnetV2 Backbone. (#1253) * add efficientnet_v2.py * add efficientnet_v2 in __init__.py * add efficientnet_v2_s base config file * add efficientnet_v2 config file * add efficientnet_v2 config file * update tuple output * update config file * update model file * update model file * update model file * update config file * update model file * update config file * update model file * update model file * update model file * update model file * update model file * update config file * update config file * update model file * update model file * update model file * update model file * update model config file * Update efficientnet_v2.py * add config file and modify arch * add config file and modify arch * add the file about convert_pth from timm to mmcls * update efficientnetv2 model file with mmcls style * add the file about convert_pth from timm to mmcls * add the file about convert_pth from timm to mmcls * update convert file * update model file * update convert file * update model file * update model file * update model file * add metefile and README * Update tools/model_converters/efficientnetv2-timm_to_mmcls.py Co-authored-by: Ezra-Yu <18586273+Ezra-Yu@users.noreply.github.com> * update model file and convert file * update model file and convert file * update model file and convert file * update model file and convert file * update model file * update model file * update model file * update config file and README file * update metafile * Update efficientnetv2_to_mmcls.py * update model-index.yml * update metafile.yml * update b0 and s train pipeline * update b0 and s train pipeline * update b0 and s train pipeline * add test_efficientnet_v2 * update test_efficientnet_v2 * update model file docs * update test_efficientnet_v2 * update test_efficientnet_v2 * add efficientnet_v2.py * add efficientnet_v2 in __init__.py * add efficientnet_v2_s base config file * add efficientnet_v2 config file * add efficientnet_v2 config file * update tuple output * update config file * update model file * update model file * update model file * update model file * update config file * update config file * update model file * update model file * update model file * update model file * update model file * update config file * update config file * 
update model file * update model file * update model file * update model file * update model config file * Update efficientnet_v2.py * add config file and modify arch * add config file and modify arch * add the file about convert_pth from timm to mmcls * update efficientnetv2 model file with mmcls style * add the file about convert_pth from timm to mmcls * add the file about convert_pth from timm to mmcls * update convert file * update model file * update convert file * update model file * update model file * update model file * add metefile and README * Update tools/model_converters/efficientnetv2-timm_to_mmcls.py Co-authored-by: Ezra-Yu <18586273+Ezra-Yu@users.noreply.github.com> * update model file and convert file * update model file and convert file * update model file and convert file * update model file and convert file * update model file * update model file * update model file * update config file and README file * update metafile * Update efficientnetv2_to_mmcls.py * update model-index.yml * update metafile.yml * update b0 and s train pipeline * update b0 and s train pipeline * update b0 and s train pipeline * add test_efficientnet_v2 * update test_efficientnet_v2 * update model file docs * update test_efficientnet_v2 * update test_efficientnet_v2 * pass pre-commit hook * refactor efficientnetv2 * refactor efficientnetv2 * update readme, metafile and weight links * update model-index.yml * fix lint * fix typo * Update efficientnetv2-b1_8xb32_in1k.py * Update efficientnetv2-b2_8xb32_in1k.py * Update efficientnetv2-b3_8xb32_in1k.py * update two moduals and model file * update modual file * update accuracys * update accuracys * update metafile * fix build docs * update links * update README.md Co-authored-by: qingtian <459291290@qq.com> Co-authored-by: Ezra-Yu <18586273+Ezra-Yu@users.noreply.github.com> --- README.md | 1 + .../efficientnet_v2/efficientnet_v2_b0.py | 12 + .../efficientnet_v2/efficientnet_v2_b1.py | 12 + .../efficientnet_v2/efficientnet_v2_b2.py | 12 + .../efficientnet_v2/efficientnet_v2_b3.py | 12 + .../efficientnet_v2/efficientnet_v2_l.py | 12 + .../efficientnet_v2/efficientnet_v2_m.py | 12 + .../efficientnet_v2/efficientnet_v2_s.py | 12 + .../efficientnet_v2/efficientnet_v2_xl.py | 12 + .../efficientnet_v2/efficientnetv2_b0.py | 12 + .../efficientnet_v2/efficientnetv2_b1.py | 12 + .../efficientnet_v2/efficientnetv2_b2.py | 12 + .../efficientnet_v2/efficientnetv2_b3.py | 12 + .../efficientnet_v2/efficientnetv2_l.py | 12 + .../efficientnet_v2/efficientnetv2_m.py | 12 + .../efficientnet_v2/efficientnetv2_s.py | 12 + .../efficientnet_v2/efficientnetv2_xl.py | 12 + configs/efficientnet_v2/README.md | 116 ++++++ .../efficientnet_v2-b0_8xb32_in1k.py | 58 +++ .../efficientnet_v2-b1_8xb32_in1k.py | 23 ++ .../efficientnet_v2-b2_8xb32_in1k.py | 23 ++ .../efficientnet_v2-b3_8xb32_in1k.py | 23 ++ .../efficientnet_v2-l_8xb32_in1k.py | 34 ++ .../efficientnet_v2-l_8xb32_in21ft1k.py | 34 ++ .../efficientnet_v2-m_8xb32_in1k.py | 34 ++ .../efficientnet_v2-m_8xb32_in21ft1k.py | 34 ++ .../efficientnet_v2-s_8xb32_in1k.py | 58 +++ .../efficientnet_v2-s_8xb32_in21ft1k.py | 34 ++ .../efficientnet_v2-xl_8xb32_in21ft1k.py | 34 ++ .../efficientnetv2-b0_8xb32_in1k.py | 58 +++ .../efficientnetv2-b1_8xb32_in1k.py | 21 ++ .../efficientnetv2-b2_8xb32_in1k.py | 21 ++ .../efficientnetv2-b3_8xb32_in1k.py | 21 ++ .../efficientnetv2-l_8xb32_in1k-480px.py | 23 ++ .../efficientnetv2-l_8xb32_in21k.py | 4 + .../efficientnetv2-m_8xb32_in1k-480px.py | 23 ++ .../efficientnetv2-m_8xb32_in21k.py | 4 + 
.../efficientnetv2-s_8xb32_in1k-384px.py | 34 ++ .../efficientnetv2-s_8xb32_in21k.py | 43 +++ .../efficientnetv2-xl_8xb32_in1k-512px.py | 23 ++ .../efficientnetv2-xl_8xb32_in21k.py | 4 + configs/efficientnet_v2/metafile.yml | 255 +++++++++++++ docs/en/api/models.rst | 1 + mmcls/models/backbones/__init__.py | 2 + mmcls/models/backbones/efficientnet.py | 6 +- mmcls/models/backbones/efficientnet_v2.py | 343 ++++++++++++++++++ model-index.yml | 1 + .../test_backbones/test_efficientnet_v2.py | 150 ++++++++ .../efficientnetv2_to_mmcls.py | 99 +++++ 49 files changed, 1831 insertions(+), 3 deletions(-) create mode 100644 configs/_base_/models/efficientnet_v2/efficientnet_v2_b0.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnet_v2_b1.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnet_v2_b2.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnet_v2_b3.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnet_v2_l.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnet_v2_m.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnet_v2_s.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnet_v2_xl.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnetv2_b0.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnetv2_b1.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnetv2_b2.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnetv2_b3.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnetv2_l.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnetv2_m.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnetv2_s.py create mode 100644 configs/_base_/models/efficientnet_v2/efficientnetv2_xl.py create mode 100644 configs/efficientnet_v2/README.md create mode 100644 configs/efficientnet_v2/efficientnet_v2-b0_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-b1_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-b2_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-b3_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-l_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-l_8xb32_in21ft1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-m_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-m_8xb32_in21ft1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-s_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-s_8xb32_in21ft1k.py create mode 100644 configs/efficientnet_v2/efficientnet_v2-xl_8xb32_in21ft1k.py create mode 100644 configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnetv2-b1_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnetv2-b2_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnetv2-b3_8xb32_in1k.py create mode 100644 configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py create mode 100644 configs/efficientnet_v2/efficientnetv2-l_8xb32_in21k.py create mode 100644 configs/efficientnet_v2/efficientnetv2-m_8xb32_in1k-480px.py create mode 100644 configs/efficientnet_v2/efficientnetv2-m_8xb32_in21k.py create mode 100644 configs/efficientnet_v2/efficientnetv2-s_8xb32_in1k-384px.py create mode 100644 configs/efficientnet_v2/efficientnetv2-s_8xb32_in21k.py create mode 100644 
configs/efficientnet_v2/efficientnetv2-xl_8xb32_in1k-512px.py create mode 100644 configs/efficientnet_v2/efficientnetv2-xl_8xb32_in21k.py create mode 100644 configs/efficientnet_v2/metafile.yml create mode 100644 mmcls/models/backbones/efficientnet_v2.py create mode 100644 tests/test_models/test_backbones/test_efficientnet_v2.py create mode 100644 tools/model_converters/efficientnetv2_to_mmcls.py diff --git a/README.md b/README.md index 95f3c9f5106..00a028fe286 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea - [x] [BEiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beit) / [BEiT v2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/beitv2) - [x] [EVA](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/eva) - [x] [MixMIM](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mixmim) +- [x] [EfficientNetV2](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/efficientnet_v2) diff --git a/configs/_base_/models/efficientnet_v2/efficientnet_v2_b0.py b/configs/_base_/models/efficientnet_v2/efficientnet_v2_b0.py new file mode 100644 index 00000000000..d42e32905ed --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnet_v2_b0.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='b0'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnet_v2_b1.py b/configs/_base_/models/efficientnet_v2/efficientnet_v2_b1.py new file mode 100644 index 00000000000..10736fc5046 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnet_v2_b1.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='b1'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnet_v2_b2.py b/configs/_base_/models/efficientnet_v2/efficientnet_v2_b2.py new file mode 100644 index 00000000000..61f477120e0 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnet_v2_b2.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='b2'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1408, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnet_v2_b3.py b/configs/_base_/models/efficientnet_v2/efficientnet_v2_b3.py new file mode 100644 index 00000000000..14e523fd2e4 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnet_v2_b3.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='b3'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1536, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnet_v2_l.py b/configs/_base_/models/efficientnet_v2/efficientnet_v2_l.py new file mode 100644 
index 00000000000..456467d6fa0 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnet_v2_l.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='l'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnet_v2_m.py b/configs/_base_/models/efficientnet_v2/efficientnet_v2_m.py new file mode 100644 index 00000000000..8e4d303f624 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnet_v2_m.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='m'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnet_v2_s.py b/configs/_base_/models/efficientnet_v2/efficientnet_v2_s.py new file mode 100644 index 00000000000..866648223c7 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnet_v2_s.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='s'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnet_v2_xl.py b/configs/_base_/models/efficientnet_v2/efficientnet_v2_xl.py new file mode 100644 index 00000000000..2216c9daa7d --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnet_v2_xl.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='xl'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnetv2_b0.py b/configs/_base_/models/efficientnet_v2/efficientnetv2_b0.py new file mode 100644 index 00000000000..d42e32905ed --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnetv2_b0.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='b0'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnetv2_b1.py b/configs/_base_/models/efficientnet_v2/efficientnetv2_b1.py new file mode 100644 index 00000000000..10736fc5046 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnetv2_b1.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='b1'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnetv2_b2.py b/configs/_base_/models/efficientnet_v2/efficientnetv2_b2.py new file mode 100644 index 00000000000..61f477120e0 --- /dev/null 
+++ b/configs/_base_/models/efficientnet_v2/efficientnetv2_b2.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='b2'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1408, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnetv2_b3.py b/configs/_base_/models/efficientnet_v2/efficientnetv2_b3.py new file mode 100644 index 00000000000..14e523fd2e4 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnetv2_b3.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='b3'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1536, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnetv2_l.py b/configs/_base_/models/efficientnet_v2/efficientnetv2_l.py new file mode 100644 index 00000000000..456467d6fa0 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnetv2_l.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='l'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnetv2_m.py b/configs/_base_/models/efficientnet_v2/efficientnetv2_m.py new file mode 100644 index 00000000000..8e4d303f624 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnetv2_m.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='m'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnetv2_s.py b/configs/_base_/models/efficientnet_v2/efficientnetv2_s.py new file mode 100644 index 00000000000..866648223c7 --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnetv2_s.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='s'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_v2/efficientnetv2_xl.py b/configs/_base_/models/efficientnet_v2/efficientnetv2_xl.py new file mode 100644 index 00000000000..2216c9daa7d --- /dev/null +++ b/configs/_base_/models/efficientnet_v2/efficientnetv2_xl.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNetV2', arch='xl'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/efficientnet_v2/README.md b/configs/efficientnet_v2/README.md new file mode 100644 index 00000000000..b249ef56539 --- /dev/null +++ b/configs/efficientnet_v2/README.md @@ -0,0 +1,116 @@ +# EfficientNetV2 + +> [EfficientNetV2: Smaller 
Models and Faster Training](https://arxiv.org/abs/2104.00298) + + + +## Abstract + +This paper introduces EfficientNetV2, a new family of convolutional networks that have faster training speed and better parameter efficiency than previous models. To develop this family of models, we use a combination of training-aware neural architecture search and scaling, to jointly optimize training speed and parameter efficiency. The models were searched from the search space enriched with new ops such as Fused-MBConv. Our experiments show that EfficientNetV2 models train much faster than state-of-the-art models while being up to 6.8x smaller. Our training can be further sped up by progressively increasing the image size during training, but it often causes a drop in accuracy. To compensate for this accuracy drop, we propose to adaptively adjust regularization (e.g., dropout and data augmentation) as well, such that we can achieve both fast training and good accuracy. With progressive learning, our EfficientNetV2 significantly outperforms previous models on ImageNet and CIFAR/Cars/Flowers datasets. By pretraining on the same ImageNet21k, our EfficientNetV2 achieves 87.3% top-1 accuracy on ImageNet ILSVRC2012, outperforming the recent ViT by 2.0% accuracy while training 5x-11x faster using the same computing resources. Code will be available at https://github.com/google/automl/tree/master/efficientnetv2. + +
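+The progressive learning recipe described above can be sketched as follows. This is an
+illustrative sketch only: the number of stages, the image sizes and the dropout range are
+assumptions for demonstration, not the schedule used to train the released checkpoints.
+
+```python
+# Illustrative sketch: linearly ramp the image size and the regularization
+# strength (here, dropout) from weak to strong over the training stages.
+def progressive_schedule(stage, num_stages=4, min_size=128, max_size=300,
+                         min_dropout=0.1, max_dropout=0.3):
+    t = stage / (num_stages - 1)
+    image_size = int(min_size + t * (max_size - min_size))
+    dropout = min_dropout + t * (max_dropout - min_dropout)
+    return image_size, dropout
+
+for stage in range(4):
+    print(stage, progressive_schedule(stage))
+# image size grows from 128 to 300 while dropout grows from 0.1 to 0.3
+```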
+ +
+ +## How to use it? + + + +**Predict image** + +```python +>>> import torch +>>> from mmcls.apis import init_model, inference_model +>>> +>>> model = init_model('configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py', "https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b0_8xb32_in1k_20221219-9689f21f.pth") +>>> predict = inference_model(model, 'demo/demo.JPEG') +>>> print(predict['pred_class']) +sea snake +>>> print(predict['pred_score']) +0.3147328197956085 +``` + +**Use the model** + +```python +>>> import torch +>>> from mmcls import get_model +>>> +>>> model = get_model("efficientnetv2-b0_3rdparty_in1k", pretrained=True) +>>> model.eval() +>>> inputs = torch.rand(1, 3, 224, 224).to(model.data_preprocessor.device) +>>> # To get classification scores. +>>> out = model(inputs) +>>> print(out.shape) +torch.Size([1, 1000]) +>>> # To extract features. +>>> outs = model.extract_feat(inputs) +>>> print(outs[0].shape) +torch.Size([1, 1280]) +``` + +**Train/Test Command** + +Place the ImageNet dataset to the `data/imagenet/` directory, or prepare datasets according to the [docs](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#prepare-dataset). + +Train: + +```shell +python tools/train.py configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py +``` + +Test: + +```shell +python tools/test.py configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b0_8xb32_in1k_20221219-9689f21f.pth +``` + + + +For more configurable parameters, please refer to the [API](https://mmclassification.readthedocs.io/en/1.x/api/generated/mmcls.models.backbones.EfficientNetV2.html#mmcls.models.backbones.EfficientNetV2). + +## Results and models + +### ImageNet-1k + +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +| :------------------------------------------------: | :----------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------: | :----------------------------------------------------: | +| EfficientNetV2-b0\* (`efficientnetv2-b0_3rdparty_in1k`) | From scratch | 7.14 | 0.92 | 78.52 | 94.44 | [config](./efficientnetv2-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b0_3rdparty_in1k_20221221-9ef6e736.pth) | +| EfficientNetV2-b1\* (`efficientnetv2-b1_3rdparty_in1k`) | From scratch | 8.14 | 1.44 | 79.80 | 94.89 | [config](./efficientnetv2-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b1_3rdparty_in1k_20221221-6955d9ce.pth) | +| EfficientNetV2-b2\* (`efficientnetv2-b2_3rdparty_in1k`) | From scratch | 10.10 | 1.99 | 80.63 | 95.30 | [config](./efficientnetv2-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b2_3rdparty_in1k_20221221-74f7d493.pth) | +| EfficientNetV2-b3\* (`efficientnetv2-b3_3rdparty_in1k`) | From scratch | 14.36 | 3.50 | 82.03 | 95.88 | [config](./efficientnetv2-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b3_3rdparty_in1k_20221221-b6f07a36.pth) | +| EfficientNetV2-s\* (`efficientnetv2-s_3rdparty_in1k`) | From scratch | 21.46 | 9.72 | 83.82 | 96.67 | [config](./efficientnetv2-s_8xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_3rdparty_in1k_20221220-f0eaff9d.pth) | +| 
EfficientNetV2-m\* (`efficientnetv2-m_3rdparty_in1k`) | From scratch | 54.14 | 26.88 | 85.01 | 97.26 | [config](./efficientnetv2-m_8xb32_in1k-480px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_3rdparty_in1k_20221220-9dc0c729.pth) | +| EfficientNetV2-l\* (`efficientnetv2-l_3rdparty_in1k`) | From scratch | 118.52 | 60.14 | 85.43 | 97.31 | [config](./efficientnetv2-l_8xb32_in1k-480px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_3rdparty_in1k_20221220-5c3bac0f.pth) | +| EfficientNetV2-s\* (`efficientnetv2-s_in21k-pre_3rdparty_in1k`) | ImageNet 21k | 21.46 | 9.72 | 84.29 | 97.26 | [config](./efficientnetv2-s_8xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_in21k-pre-3rdparty_in1k_20221220-7a7c8475.pth) | +| EfficientNetV2-m\* (`efficientnetv2-m_in21k-pre_3rdparty_in1k`) | ImageNet 21k | 54.14 | 26.88 | 85.47 | 97.76 | [config](./efficientnetv2-m_8xb32_in1k-480px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_in21k-pre-3rdparty_in1k_20221220-a1013a04.pth) | +| EfficientNetV2-l\* (`efficientnetv2-l_in21k-pre_3rdparty_in1k`) | ImageNet 21k | 118.52 | 60.14 | 86.31 | 97.99 | [config](./efficientnetv2-l_8xb32_in1k-480px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_in21k-pre-3rdparty_in1k_20221220-63df0efd.pth) | +| EfficientNetV2-xl\* (`efficientnetv2-xl_in21k-pre_3rdparty_in1k`) | ImageNet 21k | 208.12 | 98.34 | 86.39 | 97.83 | [config](./efficientnetv2-xl_8xb32_in1k-512px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-xl_in21k-pre-3rdparty_in1k_20221220-583ac18b.pth) | + +*Models with * are converted from the [official repo](https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +### Pre-trained Models In ImageNet-21K + +The pre-trained models are only used to fine-tune, and therefore cannot be trained and don't have evaluation results. 
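+
+To fine-tune from one of the ImageNet-21k checkpoints listed below, the backbone weights
+can be loaded through `init_cfg` while the classification head is re-initialized for the
+target dataset. The snippet is a minimal sketch: it reuses the EfficientNetV2-s weight from
+the table below and assumes the checkpoint stores its parameters under the `backbone.`
+prefix; adjust the base config, head and schedule to your own dataset.
+
+```python
+_base_ = ['./efficientnetv2-s_8xb32_in1k-384px.py']
+
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_3rdparty_in21k_20221220-c0572b56.pth',  # noqa
+            prefix='backbone',
+        )))
+```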
+ +| Model | Params(M) | Flops(G) | Config | Download | +| :------------------------------------------------------: | :-------: | :------: | :-----------------------------------------------: | :----------------------------------------------------------------------------: | +| EfficientNetV2-s\* (`efficientnetv2-s_3rdparty_in21k`) | 21.46 | 9.72 | [config](./efficientnetv2-s_8xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_3rdparty_in21k_20221220-c0572b56.pth) | +| EfficientNetV2-m\* (`efficientnetv2-m_3rdparty_in21k`) | 54.14 | 26.88 | [config](./efficientnetv2-m_8xb32_in1k-480px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_3rdparty_in21k_20221220-073e944c.pth) | +| EfficientNetV2-l\* (`efficientnetv2-l_3rdparty_in21k`) | 118.52 | 60.14 | [config](./efficientnetv2-l_8xb32_in1k-480px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_3rdparty_in21k_20221220-f28f91e1.pth) | +| EfficientNetV2-xl\* (`efficientnetv2-xl_3rdparty_in21k`) | 208.12 | 98.34 | [config](./efficientnetv2-xl_8xb32_in1k-512px.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-xl_3rdparty_in21k_20221220-b2c9329c.pth) | + +*Models with * are converted from the [official repo](https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py).* + +## Citation + +```bibtex +@inproceedings{tan2021efficientnetv2, + title={Efficientnetv2: Smaller models and faster training}, + author={Tan, Mingxing and Le, Quoc}, + booktitle={International Conference on Machine Learning}, + pages={10096--10106}, + year={2021}, + organization={PMLR} +} +``` diff --git a/configs/efficientnet_v2/efficientnet_v2-b0_8xb32_in1k.py b/configs/efficientnet_v2/efficientnet_v2-b0_8xb32_in1k.py new file mode 100644 index 00000000000..0ce48d43ae1 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-b0_8xb32_in1k.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_b0.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +bgr_mean = data_preprocessor['mean'][::-1] +bgr_std = data_preprocessor['std'][::-1] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=192, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict( + type='RandAugment', + policies='timm_increasing', + num_policies=2, + total_level=10, + magnitude_level=9, + magnitude_std=0.5, + hparams=dict( + pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=bgr_mean, + fill_std=bgr_std), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=224, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git 
a/configs/efficientnet_v2/efficientnet_v2-b1_8xb32_in1k.py b/configs/efficientnet_v2/efficientnet_v2-b1_8xb32_in1k.py new file mode 100644 index 00000000000..9d628d05981 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-b1_8xb32_in1k.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_b1.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=192), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=240), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-b2_8xb32_in1k.py b/configs/efficientnet_v2/efficientnet_v2-b2_8xb32_in1k.py new file mode 100644 index 00000000000..e15f7698e81 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-b2_8xb32_in1k.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_b2.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=208), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=260), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-b3_8xb32_in1k.py b/configs/efficientnet_v2/efficientnet_v2-b3_8xb32_in1k.py new file mode 100644 index 00000000000..2f4b664dfe2 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-b3_8xb32_in1k.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_b3.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=240), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=300), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-l_8xb32_in1k.py b/configs/efficientnet_v2/efficientnet_v2-l_8xb32_in1k.py new file mode 100644 index 00000000000..5b150138319 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-l_8xb32_in1k.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_l.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB 
format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=480, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-l_8xb32_in21ft1k.py b/configs/efficientnet_v2/efficientnet_v2-l_8xb32_in21ft1k.py new file mode 100644 index 00000000000..5b150138319 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-l_8xb32_in21ft1k.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_l.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=480, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-m_8xb32_in1k.py b/configs/efficientnet_v2/efficientnet_v2-m_8xb32_in1k.py new file mode 100644 index 00000000000..80884286c81 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-m_8xb32_in1k.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_m.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=480, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-m_8xb32_in21ft1k.py b/configs/efficientnet_v2/efficientnet_v2-m_8xb32_in21ft1k.py new file mode 100644 index 00000000000..80884286c81 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-m_8xb32_in21ft1k.py @@ -0,0 +1,34 @@ 
+_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_m.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=480, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-s_8xb32_in1k.py b/configs/efficientnet_v2/efficientnet_v2-s_8xb32_in1k.py new file mode 100644 index 00000000000..ee85b9f3336 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-s_8xb32_in1k.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_s.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +bgr_mean = data_preprocessor['mean'][::-1] +bgr_std = data_preprocessor['std'][::-1] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=300, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict( + type='RandAugment', + policies='timm_increasing', + num_policies=2, + total_level=10, + magnitude_level=9, + magnitude_std=0.5, + hparams=dict( + pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=bgr_mean, + fill_std=bgr_std), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=384, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-s_8xb32_in21ft1k.py b/configs/efficientnet_v2/efficientnet_v2-s_8xb32_in21ft1k.py new file mode 100644 index 00000000000..2eed7b68890 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-s_8xb32_in21ft1k.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_s.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + 
dict(type='EfficientNetRandomCrop', scale=300, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=384, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnet_v2-xl_8xb32_in21ft1k.py b/configs/efficientnet_v2/efficientnet_v2-xl_8xb32_in21ft1k.py new file mode 100644 index 00000000000..6309ee4bbf6 --- /dev/null +++ b/configs/efficientnet_v2/efficientnet_v2-xl_8xb32_in21ft1k.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnet_v2_xl.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=512, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py b/configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py new file mode 100644 index 00000000000..c8a64f56029 --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnetv2_b0.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +bgr_mean = data_preprocessor['mean'][::-1] +bgr_std = data_preprocessor['std'][::-1] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + scale=192, + backend='pillow', + interpolation='bicubic'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict( + type='RandAugment', + policies='timm_increasing', + num_policies=2, + total_level=10, + magnitude_level=9, + magnitude_std=0.5, + hparams=dict( + pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')), + dict( + type='RandomErasing', + erase_prob=0.25, + mode='rand', + min_area_ratio=0.02, + max_area_ratio=1 / 3, + fill_color=bgr_mean, + fill_std=bgr_std), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=224, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = 
dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-b1_8xb32_in1k.py b/configs/efficientnet_v2/efficientnetv2-b1_8xb32_in1k.py new file mode 100644 index 00000000000..33f48dfd26a --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-b1_8xb32_in1k.py @@ -0,0 +1,21 @@ +_base_ = ['./efficientnetv2-b0_8xb32_in1k.py'] + +# model setting +model = dict(backbone=dict(arch='b1'), head=dict(in_channels=1280, )) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=192), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=240, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-b2_8xb32_in1k.py b/configs/efficientnet_v2/efficientnetv2-b2_8xb32_in1k.py new file mode 100644 index 00000000000..497c2aa3727 --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-b2_8xb32_in1k.py @@ -0,0 +1,21 @@ +_base_ = ['./efficientnetv2-b0_8xb32_in1k.py'] + +# model setting +model = dict(backbone=dict(arch='b2'), head=dict(in_channels=1408, )) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=208), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=260, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-b3_8xb32_in1k.py b/configs/efficientnet_v2/efficientnetv2-b3_8xb32_in1k.py new file mode 100644 index 00000000000..16f82c3a512 --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-b3_8xb32_in1k.py @@ -0,0 +1,21 @@ +_base_ = ['./efficientnetv2-b0_8xb32_in1k.py'] + +# model setting +model = dict(backbone=dict(arch='b3'), head=dict(in_channels=1536, )) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=240), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=300, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py b/configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py new file mode 100644 index 00000000000..2bef5591c87 --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py @@ -0,0 +1,23 @@ +_base_ = [ + 'efficientnetv2-s_8xb32_in1k-384px.py', +] + +# model setting +model = dict(backbone=dict(arch='l'), ) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ 
+ dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=480, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-l_8xb32_in21k.py b/configs/efficientnet_v2/efficientnetv2-l_8xb32_in21k.py new file mode 100644 index 00000000000..179c72075f6 --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-l_8xb32_in21k.py @@ -0,0 +1,4 @@ +_base_ = ['./efficientnetv2-s_8xb32_in21k.py'] + +# model setting +model = dict(backbone=dict(arch='l'), ) diff --git a/configs/efficientnet_v2/efficientnetv2-m_8xb32_in1k-480px.py b/configs/efficientnet_v2/efficientnetv2-m_8xb32_in1k-480px.py new file mode 100644 index 00000000000..06f941e2eeb --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-m_8xb32_in1k-480px.py @@ -0,0 +1,23 @@ +_base_ = [ + 'efficientnetv2-s_8xb32_in1k-384px.py', +] + +# model setting +model = dict(backbone=dict(arch='m'), ) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=480, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-m_8xb32_in21k.py b/configs/efficientnet_v2/efficientnetv2-m_8xb32_in21k.py new file mode 100644 index 00000000000..f04d616376a --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-m_8xb32_in21k.py @@ -0,0 +1,4 @@ +_base_ = ['./efficientnetv2-s_8xb32_in21k.py'] + +# model setting +model = dict(backbone=dict(arch='m'), ) diff --git a/configs/efficientnet_v2/efficientnetv2-s_8xb32_in1k-384px.py b/configs/efficientnet_v2/efficientnetv2-s_8xb32_in1k-384px.py new file mode 100644 index 00000000000..2d9b8e4f7fb --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-s_8xb32_in1k-384px.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnetv2_s.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=300, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=384, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-s_8xb32_in21k.py b/configs/efficientnet_v2/efficientnetv2-s_8xb32_in21k.py new file mode 100644 index 00000000000..e45369463ac --- /dev/null +++ 
b/configs/efficientnet_v2/efficientnetv2-s_8xb32_in21k.py @@ -0,0 +1,43 @@ +_base_ = [ + '../_base_/models/efficientnet_v2/efficientnetv2_s.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# model setting +model = dict(head=dict(num_classes=21843)) + +# dataset settings +dataset_type = 'ImageNet21k' +data_preprocessor = dict( + num_classes=21843, + # RGB format normalization parameters + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=224), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=224, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# schedule setting +optim_wrapper = dict( + optimizer=dict(lr=4e-3), + clip_grad=dict(max_norm=5.0), +) diff --git a/configs/efficientnet_v2/efficientnetv2-xl_8xb32_in1k-512px.py b/configs/efficientnet_v2/efficientnetv2-xl_8xb32_in1k-512px.py new file mode 100644 index 00000000000..ea161aa655a --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-xl_8xb32_in1k-512px.py @@ -0,0 +1,23 @@ +_base_ = [ + 'efficientnetv2-s_8xb32_in1k-384px.py', +] + +# model setting +model = dict(backbone=dict(arch='xl'), ) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetRandomCrop', scale=384, crop_padding=0), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackClsInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='EfficientNetCenterCrop', crop_size=512, crop_padding=0), + dict(type='PackClsInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet_v2/efficientnetv2-xl_8xb32_in21k.py b/configs/efficientnet_v2/efficientnetv2-xl_8xb32_in21k.py new file mode 100644 index 00000000000..e2ee84cb32f --- /dev/null +++ b/configs/efficientnet_v2/efficientnetv2-xl_8xb32_in21k.py @@ -0,0 +1,4 @@ +_base_ = ['./efficientnetv2-s_8xb32_in21k.py'] + +# model setting +model = dict(backbone=dict(arch='xl'), ) diff --git a/configs/efficientnet_v2/metafile.yml b/configs/efficientnet_v2/metafile.yml new file mode 100644 index 00000000000..cfbdd5f3e8c --- /dev/null +++ b/configs/efficientnet_v2/metafile.yml @@ -0,0 +1,255 @@ +Collections: + - Name: EfficientNetV2 + Metadata: + Training Data: ImageNet-1k + Architecture: + - 1x1 Convolution + - Average Pooling + - Convolution + - Dense Connections + - Dropout + - Inverted Residual Block + - RMSProp + - Squeeze-and-Excitation Block + - Swish + Paper: + URL: https://arxiv.org/abs/2104.00298 + Title: "EfficientNetV2: Smaller Models and Faster Training" + README: configs/efficientnet_v2/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/dev-1.x/mmcls/models/backbones/beit.py + Version: v1.0.0rc4 + +Models: + - Name: efficientnetv2-b0_3rdparty_in1k + Metadata: + FLOPs: 919843360 + Parameters: 7139704 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 
Accuracy: 78.52 + Top 5 Accuracy: 94.44 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b0_3rdparty_in1k_20221221-9ef6e736.pth + Config: configs/efficientnet_v2/efficientnetv2-b0_8xb32_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b0-c7cc451f.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-b1_3rdparty_in1k + Metadata: + FLOPs: 1438287552 + Parameters: 8141052 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.80 + Top 5 Accuracy: 94.89 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b1_3rdparty_in1k_20221221-6955d9ce.pth + Config: configs/efficientnet_v2/efficientnetv2-b1_8xb32_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b1-be6e41b0.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-b2_3rdparty_in1k + Metadata: + FLOPs: 1986433080 + Parameters: 10096086 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 80.63 + Top 5 Accuracy: 95.30 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b2_3rdparty_in1k_20221221-74f7d493.pth + Config: configs/efficientnet_v2/efficientnetv2-b2_8xb32_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b2-847de54e.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-b3_3rdparty_in1k + Metadata: + FLOPs: 3498068400 + Parameters: 14358406 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.03 + Top 5 Accuracy: 95.88 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-b3_3rdparty_in1k_20221221-b6f07a36.pth + Config: configs/efficientnet_v2/efficientnetv2-b3_8xb32_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b3-57773f13.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-s_3rdparty_in1k + Metadata: + FLOPs: 9719420928 + Parameters: 21458488 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.82 + Top 5 Accuracy: 96.67 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_3rdparty_in1k_20221220-f0eaff9d.pth + Config: configs/efficientnet_v2/efficientnetv2-s_8xb32_in1k-384px.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s-eb54923e.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-m_3rdparty_in1k + Metadata: + FLOPs: 26880363584 + Parameters: 54139356 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.01 + Top 5 Accuracy: 97.26 + Task: Image Classification + 
Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_3rdparty_in1k_20221220-9dc0c729.pth + Config: configs/efficientnet_v2/efficientnetv2-m_8xb32_in1k-480px.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m-cc09e0cd.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-l_3rdparty_in1k + Metadata: + FLOPs: 60142387008 + Parameters: 118515272 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.43 + Top 5 Accuracy: 97.31 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_3rdparty_in1k_20221220-5c3bac0f.pth + Config: configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l-d664b728.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-s_in21k-pre_3rdparty_in1k + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 9719420928 + Parameters: 21458488 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.29 + Top 5 Accuracy: 97.26 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_in21k-pre-3rdparty_in1k_20221220-7a7c8475.pth + Config: configs/efficientnet_v2/efficientnetv2-s_8xb32_in1k-384px.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21ft1k-d7dafa41.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-m_in21k-pre_3rdparty_in1k + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 26880363584 + Parameters: 54139356 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.47 + Top 5 Accuracy: 97.76 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_in21k-pre-3rdparty_in1k_20221220-a1013a04.pth + Config: configs/efficientnet_v2/efficientnetv2-m_8xb32_in1k-480px.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21ft1k-bf41664a.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-l_in21k-pre_3rdparty_in1k + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 60142387008 + Parameters: 118515272 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.31 + Top 5 Accuracy: 97.99 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_in21k-pre-3rdparty_in1k_20221220-63df0efd.pth + Config: configs/efficientnet_v2/efficientnetv2-l_8xb32_in1k-480px.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21ft1k-60127a9d.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-xl_in21k-pre_3rdparty_in1k + Metadata: + Training Data: + 
- ImageNet-21k + - ImageNet-1k + FLOPs: 98341230592 + Parameters: 208119808 + In Collection: EfficientNetV2 + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.39 + Top 5 Accuracy: 97.83 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-xl_in21k-pre-3rdparty_in1k_20221220-583ac18b.pth + Config: configs/efficientnet_v2/efficientnetv2-xl_8xb32_in1k-512px.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21ft1k-06c35c48.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-s_3rdparty_in21k + Metadata: + FLOPs: 3309720768 + Parameters: 48158371 + In Collection: EfficientNetV2 + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-s_3rdparty_in21k_20221220-c0572b56.pth + Config: configs/efficientnet_v2/efficientnetv2-s_8xb32_in21k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21k-6337ad01.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-m_3rdparty_in21k + Metadata: + FLOPs: 5861638208 + Parameters: 80839239 + In Collection: EfficientNetV2 + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-m_3rdparty_in21k_20221220-073e944c.pth + Config: configs/efficientnet_v2/efficientnetv2-m_8xb32_in21k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21k-361418a2.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-l_3rdparty_in21k + Metadata: + FLOPs: 13114950464 + Parameters: 145215155 + In Collection: EfficientNetV2 + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-l_3rdparty_in21k_20221220-f28f91e1.pth + Config: configs/efficientnet_v2/efficientnetv2-l_8xb32_in21k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21k-91a19ec9.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + - Name: efficientnetv2-xl_3rdparty_in21k + Metadata: + FLOPs: 18855244288 + Parameters: 234819691 + In Collection: EfficientNetV2 + Results: null + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnetv2/efficientnetv2-xl_3rdparty_in21k_20221220-b2c9329c.pth + Config: configs/efficientnet_v2/efficientnetv2-xl_8xb32_in21k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21k-fd7e8abf.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py diff --git a/docs/en/api/models.rst b/docs/en/api/models.rst index 24ce3f77411..36befae92fe 100644 --- a/docs/en/api/models.rst +++ b/docs/en/api/models.rst @@ -73,6 +73,7 @@ Backbones EdgeNeXt EfficientFormer EfficientNet + EfficientNetV2 HRNet HorNet InceptionV3 diff --git a/mmcls/models/backbones/__init__.py b/mmcls/models/backbones/__init__.py index b583d988dfe..1e22bb67e2c 100644 --- a/mmcls/models/backbones/__init__.py +++ b/mmcls/models/backbones/__init__.py @@ -12,6 
+12,7 @@ from .edgenext import EdgeNeXt from .efficientformer import EfficientFormer from .efficientnet import EfficientNet +from .efficientnet_v2 import EfficientNetV2 from .hornet import HorNet from .hrnet import HRNet from .inception_v3 import InceptionV3 @@ -78,6 +79,7 @@ 'PCPVT', 'SVT', 'EfficientNet', + 'EfficientNetV2', 'ConvNeXt', 'HRNet', 'ResNetV1c', diff --git a/mmcls/models/backbones/efficientnet.py b/mmcls/models/backbones/efficientnet.py index be0b08a218d..b7ea5a82b2f 100644 --- a/mmcls/models/backbones/efficientnet.py +++ b/mmcls/models/backbones/efficientnet.py @@ -69,7 +69,7 @@ def __init__(self, in_channels=in_channels, out_channels=mid_channels, kernel_size=kernel_size, - stride=1, + stride=stride, padding=kernel_size // 2, conv_cfg=conv_cfg, norm_cfg=norm_cfg, @@ -82,9 +82,9 @@ def __init__(self, in_channels=mid_channels, out_channels=out_channels, kernel_size=1, - stride=stride, + stride=1, padding=0, - conv_cfg=conv_cfg, + conv_cfg=None, norm_cfg=norm_cfg, act_cfg=None) diff --git a/mmcls/models/backbones/efficientnet_v2.py b/mmcls/models/backbones/efficientnet_v2.py new file mode 100644 index 00000000000..84539c49deb --- /dev/null +++ b/mmcls/models/backbones/efficientnet_v2.py @@ -0,0 +1,343 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn.bricks import ConvModule, DropPath +from mmengine.model import Sequential +from torch import Tensor + +from mmcls.models.backbones.base_backbone import BaseBackbone +from mmcls.models.backbones.efficientnet import EdgeResidual as FusedMBConv +from mmcls.models.utils import InvertedResidual as MBConv +from mmcls.registry import MODELS + + +class EnhancedConvModule(ConvModule): + """ConvModule with short-cut and droppath. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + has_skip (bool): Whether there is short-cut. Defaults to False. + drop_path_rate (float): Stochastic depth rate. Default 0.0. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. + bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + inplace (bool): Whether to use inplace mode for activation. + Default: True. + with_spectral_norm (bool): Whether use spectral norm in conv module. + Default: False. + padding_mode (str): If the `padding_mode` has not been supported by + current `Conv2d` in PyTorch, we will use our own padding layer + instead. Currently, we support ['zeros', 'circular'] with official + implementation and ['reflect'] with our own implementation. + Default: 'zeros'. 
+ order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Common examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). + Default: ('conv', 'norm', 'act'). + """ + + def __init__(self, *args, has_skip=False, drop_path_rate=0, **kwargs): + super().__init__(*args, **kwargs) + self.has_skip = has_skip + if self.has_skip and (self.in_channels != self.out_channels + or self.stride != (1, 1)): + raise ValueError('the stride must be 1 and the `in_channels` and' + ' `out_channels` must be the same , when ' + '`has_skip` is True in `EnhancedConvModule` .') + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else nn.Identity() + + def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor: + short_cut = x + x = super().forward(x, **kwargs) + if self.has_skip: + x = self.drop_path(x) + short_cut + return x + + +@MODELS.register_module() +class EfficientNetV2(BaseBackbone): + """EfficientNetV2 backbone. + + A PyTorch implementation of EfficientNetV2 introduced by: + `EfficientNetV2: Smaller Models and Faster Training + `_ + + Args: + arch (str): Architecture of efficientnetv2. Defaults to s. + in_channels (int): Number of input image channels. Defaults to 3. + drop_path_rate (float): The ratio of the stochastic depth. + Defaults to 0.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (-1, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Defaults to 0, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + """ + + # Parameters to build layers. From left to right: + # - repeat (int): The repeat number of the block in the layer + # - kernel_size (int): The kernel size of the layer + # - stride (int): The stride of the first block of the layer + # - expand_ratio (int, float): The expand_ratio of the mid_channels + # - in_channel (int): The number of in_channels of the layer + # - out_channel (int): The number of out_channels of the layer + # - se_ratio (float): The sequeeze ratio of SELayer. 
+ # - block_type (int): -2: ConvModule, -1: EnhancedConvModule, + # 0: FusedMBConv, 1: MBConv + arch_settings = { + **dict.fromkeys(['small', 's'], [[2, 3, 1, 1, 24, 24, 0.0, -1], + [4, 3, 2, 4, 24, 48, 0.0, 0], + [4, 3, 2, 4, 48, 64, 0.0, 0], + [6, 3, 2, 4, 64, 128, 0.25, 1], + [9, 3, 1, 6, 128, 160, 0.25, 1], + [15, 3, 2, 6, 160, 256, 0.25, 1], + [1, 1, 1, 1, 256, 1280, 0.0, -2]]), + **dict.fromkeys(['m', 'medium'], [[3, 3, 1, 1, 24, 24, 0.0, -1], + [5, 3, 2, 4, 24, 48, 0.0, 0], + [5, 3, 2, 4, 48, 80, 0.0, 0], + [7, 3, 2, 4, 80, 160, 0.25, 1], + [14, 3, 1, 6, 160, 176, 0.25, 1], + [18, 3, 2, 6, 176, 304, 0.25, 1], + [5, 3, 1, 6, 304, 512, 0.25, 1], + [1, 1, 1, 1, 512, 1280, 0.0, -2]]), + **dict.fromkeys(['l', 'large'], [[4, 3, 1, 1, 32, 32, 0.0, -1], + [7, 3, 2, 4, 32, 64, 0.0, 0], + [7, 3, 2, 4, 64, 96, 0.0, 0], + [10, 3, 2, 4, 96, 192, 0.25, 1], + [19, 3, 1, 6, 192, 224, 0.25, 1], + [25, 3, 2, 6, 224, 384, 0.25, 1], + [7, 3, 1, 6, 384, 640, 0.25, 1], + [1, 1, 1, 1, 640, 1280, 0.0, -2]]), + **dict.fromkeys(['xl'], [[4, 3, 1, 1, 32, 32, 0.0, -1], + [8, 3, 2, 4, 32, 64, 0.0, 0], + [8, 3, 2, 4, 64, 96, 0.0, 0], + [16, 3, 2, 4, 96, 192, 0.25, 1], + [24, 3, 1, 6, 192, 256, 0.25, 1], + [32, 3, 2, 6, 256, 512, 0.25, 1], + [8, 3, 1, 6, 512, 640, 0.25, 1], + [1, 1, 1, 1, 640, 1280, 0.0, -2]]), + **dict.fromkeys(['b0'], [[1, 3, 1, 1, 32, 16, 0.0, -1], + [2, 3, 2, 4, 16, 32, 0.0, 0], + [2, 3, 2, 4, 32, 48, 0.0, 0], + [3, 3, 2, 4, 48, 96, 0.25, 1], + [5, 3, 1, 6, 96, 112, 0.25, 1], + [8, 3, 2, 6, 112, 192, 0.25, 1], + [1, 1, 1, 1, 192, 1280, 0.0, -2]]), + **dict.fromkeys(['b1'], [[2, 3, 1, 1, 32, 16, 0.0, -1], + [3, 3, 2, 4, 16, 32, 0.0, 0], + [3, 3, 2, 4, 32, 48, 0.0, 0], + [4, 3, 2, 4, 48, 96, 0.25, 1], + [6, 3, 1, 6, 96, 112, 0.25, 1], + [9, 3, 2, 6, 112, 192, 0.25, 1], + [1, 1, 1, 1, 192, 1280, 0.0, -2]]), + **dict.fromkeys(['b2'], [[2, 3, 1, 1, 32, 16, 0.0, -1], + [3, 3, 2, 4, 16, 32, 0.0, 0], + [3, 3, 2, 4, 32, 56, 0.0, 0], + [4, 3, 2, 4, 56, 104, 0.25, 1], + [6, 3, 1, 6, 104, 120, 0.25, 1], + [10, 3, 2, 6, 120, 208, 0.25, 1], + [1, 1, 1, 1, 208, 1408, 0.0, -2]]), + **dict.fromkeys(['b3'], [[2, 3, 1, 1, 40, 16, 0.0, -1], + [3, 3, 2, 4, 16, 40, 0.0, 0], + [3, 3, 2, 4, 40, 56, 0.0, 0], + [5, 3, 2, 4, 56, 112, 0.25, 1], + [7, 3, 1, 6, 112, 136, 0.25, 1], + [12, 3, 2, 6, 136, 232, 0.25, 1], + [1, 1, 1, 1, 232, 1536, 0.0, -2]]) + } + + def __init__(self, + arch: str = 's', + in_channels: int = 3, + drop_path_rate: float = 0., + out_indices: Sequence[int] = (-1, ), + frozen_stages: int = 0, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.1), + act_cfg=dict(type='Swish'), + norm_eval: bool = False, + with_cp: bool = False, + init_cfg=[ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + layer=['_BatchNorm', 'GroupNorm'], + val=1) + ]): + super(EfficientNetV2, self).__init__(init_cfg) + assert arch in self.arch_settings, \ + f'"{arch}" is not one of the arch_settings ' \ + f'({", ".join(self.arch_settings.keys())})' + self.arch = self.arch_settings[arch] + if frozen_stages not in range(len(self.arch) + 1): + raise ValueError('frozen_stages must be in range(0, ' + f'{len(self.arch)}), but get {frozen_stages}') + self.drop_path_rate = drop_path_rate + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.layers = nn.ModuleList() + assert self.arch[-1][-1] == -2, \ + f'the last block_type of `arch_setting` must be -2 ,' \ + f'but get `{self.arch[-1][-1]}`' + self.in_channels = in_channels + 
self.out_channels = self.arch[-1][5] + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.make_layers() + + # there len(slef.arch) + 2 layers in the backbone + # including: the first + len(self.arch) layers + the last + if isinstance(out_indices, int): + out_indices = [out_indices] + assert isinstance(out_indices, Sequence), \ + f'"out_indices" must by a sequence or int, ' \ + f'get {type(out_indices)} instead.' + out_indices = list(out_indices) + for i, index in enumerate(out_indices): + if index < 0: + out_indices[i] = len(self.layers) + index + assert 0 <= out_indices[i] <= len(self.layers), \ + f'Invalid out_indices {index}.' + self.out_indices = out_indices + + def make_layers(self, ): + # make the first layer + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=self.arch[0][4], + kernel_size=3, + stride=2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + in_channels = self.arch[0][4] + layer_setting = self.arch[:-1] + + total_num_blocks = sum([x[0] for x in layer_setting]) + block_idx = 0 + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, total_num_blocks) + ] # stochastic depth decay rule + + for layer_cfg in layer_setting: + layer = [] + (repeat, kernel_size, stride, expand_ratio, _, out_channels, + se_ratio, block_type) = layer_cfg + for i in range(repeat): + stride = stride if i == 0 else 1 + if block_type == -1: + has_skip = stride == 1 and in_channels == out_channels + droppath_rate = dpr[block_idx] if has_skip else 0.0 + layer.append( + EnhancedConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + has_skip=has_skip, + drop_path_rate=droppath_rate, + stride=stride, + padding=1, + conv_cfg=None, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + in_channels = out_channels + else: + mid_channels = int(in_channels * expand_ratio) + se_cfg = None + if block_type != 0 and se_ratio > 0: + se_cfg = dict( + channels=mid_channels, + ratio=expand_ratio * (1.0 / se_ratio), + divisor=1, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + block = FusedMBConv if block_type == 0 else MBConv + conv_cfg = self.conv_cfg if stride == 2 else None + layer.append( + block( + in_channels=in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + drop_path_rate=dpr[block_idx], + with_cp=self.with_cp)) + in_channels = out_channels + block_idx += 1 + self.layers.append(Sequential(*layer)) + + # make the last layer + self.layers.append( + ConvModule( + in_channels=in_channels, + out_channels=self.out_channels, + kernel_size=self.arch[-1][1], + stride=self.arch[-1][2], + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def forward(self, x: Tensor) -> Tuple[Tensor]: + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(EfficientNetV2, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/model-index.yml b/model-index.yml index c036c685c99..c4d8ffb593e 100644 --- a/model-index.yml +++ 
b/model-index.yml @@ -47,3 +47,4 @@ Import: - configs/revvit/metafile.yml - configs/clip/metafile.yml - configs/mixmim/metafile.yml + - configs/efficientnet_v2/metafile.yml diff --git a/tests/test_models/test_backbones/test_efficientnet_v2.py b/tests/test_models/test_backbones/test_efficientnet_v2.py new file mode 100644 index 00000000000..240688b099b --- /dev/null +++ b/tests/test_models/test_backbones/test_efficientnet_v2.py @@ -0,0 +1,150 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch +from torch.nn.modules import GroupNorm +from torch.nn.modules.batchnorm import _BatchNorm + +from mmcls.models.backbones import EfficientNetV2 + + +def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True + + +def test_efficientnet_v2_backbone(): + with pytest.raises(TypeError): + # pretrained must be a string path + model = EfficientNetV2() + model.init_weights(pretrained=0) + + with pytest.raises(AssertionError): + # arch must in arc_settings + EfficientNetV2(arch='others') + + with pytest.raises(ValueError): + # frozen_stages must less than 8 + EfficientNetV2(arch='b1', frozen_stages=12) + + # Test EfficientNetV2 + model = EfficientNetV2() + model.init_weights() + model.train() + x = torch.rand((1, 3, 224, 224)) + model(x) + + # Test EfficientNetV2 with first stage frozen + frozen_stages = 7 + model = EfficientNetV2(arch='b0', frozen_stages=frozen_stages) + model.init_weights() + model.train() + for i in range(frozen_stages): + layer = model.layers[i] + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test EfficientNetV2 with norm eval + model = EfficientNetV2(norm_eval=True) + model.init_weights() + model.train() + assert check_norm_state(model.modules(), False) + + # Test EfficientNetV2 forward with 'b0' arch + out_channels = [32, 16, 32, 48, 96, 112, 192, 1280] + model = EfficientNetV2(arch='b0', out_indices=(0, 1, 2, 3, 4, 5, 6, 7)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 8 + assert feat[0].shape == torch.Size([1, out_channels[0], 112, 112]) + assert feat[1].shape == torch.Size([1, out_channels[1], 112, 112]) + assert feat[2].shape == torch.Size([1, out_channels[2], 56, 56]) + assert feat[3].shape == torch.Size([1, out_channels[3], 28, 28]) + assert feat[4].shape == torch.Size([1, out_channels[4], 14, 14]) + assert feat[5].shape == torch.Size([1, out_channels[5], 14, 14]) + assert feat[6].shape == torch.Size([1, out_channels[6], 7, 7]) + assert feat[6].shape == torch.Size([1, out_channels[6], 7, 7]) + + # Test EfficientNetV2 forward with 'b0' arch and GroupNorm + out_channels = [32, 16, 32, 48, 96, 112, 192, 1280] + model = EfficientNetV2( + arch='b0', + out_indices=(0, 1, 2, 3, 4, 5, 6, 7), + norm_cfg=dict(type='GN', num_groups=2, requires_grad=True)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, GroupNorm) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 8 + assert feat[0].shape == torch.Size([1, out_channels[0], 32, 32]) + assert feat[1].shape == torch.Size([1, 
out_channels[1], 32, 32]) + assert feat[2].shape == torch.Size([1, out_channels[2], 16, 16]) + assert feat[3].shape == torch.Size([1, out_channels[3], 8, 8]) + assert feat[4].shape == torch.Size([1, out_channels[4], 4, 4]) + assert feat[5].shape == torch.Size([1, out_channels[5], 4, 4]) + assert feat[6].shape == torch.Size([1, out_channels[6], 2, 2]) + assert feat[7].shape == torch.Size([1, out_channels[7], 2, 2]) + + # Test EfficientNetV2 forward with 'm' arch + out_channels = [24, 24, 48, 80, 160, 176, 304, 512, 1280] + model = EfficientNetV2(arch='m', out_indices=(0, 1, 2, 3, 4, 5, 6, 7, 8)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 9 + assert feat[0].shape == torch.Size([1, out_channels[0], 32, 32]) + assert feat[1].shape == torch.Size([1, out_channels[1], 32, 32]) + assert feat[2].shape == torch.Size([1, out_channels[2], 16, 16]) + assert feat[3].shape == torch.Size([1, out_channels[3], 8, 8]) + assert feat[4].shape == torch.Size([1, out_channels[4], 4, 4]) + assert feat[5].shape == torch.Size([1, out_channels[5], 4, 4]) + assert feat[6].shape == torch.Size([1, out_channels[6], 2, 2]) + assert feat[7].shape == torch.Size([1, out_channels[7], 2, 2]) + assert feat[8].shape == torch.Size([1, out_channels[8], 2, 2]) + + # Test EfficientNetV2 forward with 'm' arch and GroupNorm + out_channels = [24, 24, 48, 80, 160, 176, 304, 512, 1280] + model = EfficientNetV2( + arch='m', + out_indices=(0, 1, 2, 3, 4, 5, 6, 7, 8), + norm_cfg=dict(type='GN', num_groups=2, requires_grad=True)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, GroupNorm) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 9 + assert feat[0].shape == torch.Size([1, out_channels[0], 32, 32]) + assert feat[1].shape == torch.Size([1, out_channels[1], 32, 32]) + assert feat[2].shape == torch.Size([1, out_channels[2], 16, 16]) + assert feat[3].shape == torch.Size([1, out_channels[3], 8, 8]) + assert feat[4].shape == torch.Size([1, out_channels[4], 4, 4]) + assert feat[5].shape == torch.Size([1, out_channels[5], 4, 4]) + assert feat[6].shape == torch.Size([1, out_channels[6], 2, 2]) + assert feat[7].shape == torch.Size([1, out_channels[7], 2, 2]) + assert feat[8].shape == torch.Size([1, out_channels[8], 2, 2]) diff --git a/tools/model_converters/efficientnetv2_to_mmcls.py b/tools/model_converters/efficientnetv2_to_mmcls.py new file mode 100644 index 00000000000..b6ae4ec1c8f --- /dev/null +++ b/tools/model_converters/efficientnetv2_to_mmcls.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+"""convert the weights of efficientnetv2 in +timm(https://github.com/rwightman/pytorch-image-models) to mmcls format.""" +import argparse +import os.path as osp + +import mmengine +import torch +from mmengine.runner import CheckpointLoader + + +def convert_from_efficientnetv2_timm(param): + # main change_key + param_lst = list(param.keys()) + op = str(int(param_lst[-9][7]) + 2) + new_key = dict() + for name in param_lst: + data = param[name] + if 'blocks' not in name: + if 'conv_stem' in name: + name = name.replace('conv_stem', 'backbone.layers.0.conv') + if 'bn1' in name: + name = name.replace('bn1', 'backbone.layers.0.bn') + if 'conv_head' in name: + # if efficientnet-v2_s/base/b1/b2/b3,op = 7, + # if for m/l/xl , op = 8 + name = name.replace('conv_head', f'backbone.layers.{op}.conv') + if 'bn2' in name: + name = name.replace('bn2', f'backbone.layers.{op}.bn') + if 'classifier' in name: + name = name.replace('classifier', 'head.fc') + else: + operator = int(name[7]) + if operator == 0: + name = name[:7] + str(operator + 1) + name[8:] + name = name.replace('blocks', 'backbone.layers') + if 'conv' in name: + name = name.replace('conv', 'conv') + if 'bn1' in name: + name = name.replace('bn1', 'bn') + elif operator < 3: + name = name[:7] + str(operator + 1) + name[8:] + name = name.replace('blocks', 'backbone.layers') + if 'conv_exp' in name: + name = name.replace('conv_exp', 'conv1.conv') + if 'conv_pwl' in name: + name = name.replace('conv_pwl', 'conv2.conv') + if 'bn1' in name: + name = name.replace('bn1', 'conv1.bn') + if 'bn2' in name: + name = name.replace('bn2', 'conv2.bn') + else: + name = name[:7] + str(operator + 1) + name[8:] + name = name.replace('blocks', 'backbone.layers') + if 'conv_pwl' in name: + name = name.replace('conv_pwl', 'linear_conv.conv') + if 'conv_pw' in name: + name = name.replace('conv_pw', 'expand_conv.conv') + if 'conv_dw' in name: + name = name.replace('conv_dw', 'depthwise_conv.conv') + if 'bn1' in name: + name = name.replace('bn1', 'expand_conv.bn') + if 'bn2' in name: + name = name.replace('bn2', 'depthwise_conv.bn') + if 'bn3' in name: + name = name.replace('bn3', 'linear_conv.bn') + if 'se.conv_reduce' in name: + name = name.replace('se.conv_reduce', 'se.conv1.conv') + if 'se.conv_expand' in name: + name = name.replace('se.conv_expand', 'se.conv2.conv') + new_key[name] = data + return new_key + + +def main(): + parser = argparse.ArgumentParser( + description='Convert pretrained efficientnetv2 ' + 'models in timm to mmcls style.') + parser.add_argument('src', help='src model path or url') + # The dst path must be a full path of the new checkpoint. + parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + weight = convert_from_efficientnetv2_timm(state_dict) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(weight, args.dst) + + print('Done!!') + + +if __name__ == '__main__': + main() From e0e6a1f1aea306de0aec0186e83210cf7b7e4ca6 Mon Sep 17 00:00:00 2001 From: WINDSKY45 <645297763@qq.com> Date: Fri, 30 Dec 2022 15:21:51 +0800 Subject: [PATCH 18/21] [Docs] Fix typo. 
(#1285) --- mmcls/models/necks/gem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmcls/models/necks/gem.py b/mmcls/models/necks/gem.py index 2ec7f860ddb..fd048469db6 100644 --- a/mmcls/models/necks/gem.py +++ b/mmcls/models/necks/gem.py @@ -31,7 +31,7 @@ class GeneralizedMeanPooling(nn.Module): """ def __init__(self, p=3., eps=1e-6, clamp=True): - assert p >= 1, "'p' must be a value greater then 1" + assert p >= 1, "'p' must be a value greater than 1" super(GeneralizedMeanPooling, self).__init__() self.p = Parameter(torch.ones(1) * p) self.eps = eps From 88e5ba28db6e4422d11b94459e7412d1679837e9 Mon Sep 17 00:00:00 2001 From: Ezra-Yu <18586273+Ezra-Yu@users.noreply.github.com> Date: Fri, 30 Dec 2022 15:49:56 +0800 Subject: [PATCH 19/21] [Reproduce] Reproduce RepVGG Training Accuracy. (#1264) * repr repvgg * add VisionRRC * uodate repvgg configs * add BCD seriers cfgs * add cv backend config * add vision configs * add L2se configs * add ra configs * add num-works configs * add num-works configs * configs * update README * rm extra config * reset un-needed changes * update * reset pbn * update readme * update code * update code * refine doc --- configs/repvgg/README.md | 217 +++++++++++++----- .../repvgg-A0_deploy_4xb64-coslr-120e_in1k.py | 3 - .../repvgg-A1_deploy_4xb64-coslr-120e_in1k.py | 3 - .../repvgg-A2_deploy_4xb64-coslr-120e_in1k.py | 3 - .../repvgg-B0_deploy_4xb64-coslr-120e_in1k.py | 3 - .../repvgg-B1_deploy_4xb64-coslr-120e_in1k.py | 3 - ...epvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py | 3 - ...epvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py | 3 - .../repvgg-B2_deploy_4xb64-coslr-120e_in1k.py | 3 - ...4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py | 3 - ...4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py | 3 - ...4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py | 3 - ...4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py | 3 - configs/repvgg/metafile.yml | 191 +++++++-------- .../repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py | 12 - configs/repvgg/repvgg-A0_8xb32_in1k.py | 33 +++ configs/repvgg/repvgg-A0_deploy_in1k.py | 3 + .../repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py | 3 - configs/repvgg/repvgg-A1_8xb32_in1k.py | 3 + ...r-120e_in1k.py => repvgg-A2_8xb32_in1k.py} | 2 +- ...r-120e_in1k.py => repvgg-B0_8xb32_in1k.py} | 2 +- ...r-120e_in1k.py => repvgg-B1_8xb32_in1k.py} | 2 +- ...120e_in1k.py => repvgg-B1g2_8xb32_in1k.py} | 2 +- ...120e_in1k.py => repvgg-B1g4_8xb32_in1k.py} | 2 +- ...r-120e_in1k.py => repvgg-B2_8xb32_in1k.py} | 2 +- ...4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py | 3 - configs/repvgg/repvgg-B2g4_8xb32_in1k.py | 3 + ...r-200e_in1k.py => repvgg-B3_8xb32_in1k.py} | 36 ++- ...4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py | 3 - configs/repvgg/repvgg-B3g4_8xb32_in1k.py | 3 + ...4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py | 3 - configs/repvgg/repvgg-D2se_8xb32_in1k.py | 28 +++ 32 files changed, 352 insertions(+), 237 deletions(-) delete mode 100644 configs/repvgg/deploy/repvgg-A0_deploy_4xb64-coslr-120e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-A1_deploy_4xb64-coslr-120e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-A2_deploy_4xb64-coslr-120e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-B0_deploy_4xb64-coslr-120e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-B1_deploy_4xb64-coslr-120e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-B2_deploy_4xb64-coslr-120e_in1k.py delete 
mode 100644 configs/repvgg/deploy/repvgg-B2g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-B3_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-B3g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py delete mode 100644 configs/repvgg/deploy/repvgg-D2se_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py delete mode 100644 configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py create mode 100644 configs/repvgg/repvgg-A0_8xb32_in1k.py create mode 100644 configs/repvgg/repvgg-A0_deploy_in1k.py delete mode 100644 configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py create mode 100644 configs/repvgg/repvgg-A1_8xb32_in1k.py rename configs/repvgg/{repvgg-A2_4xb64-coslr-120e_in1k.py => repvgg-A2_8xb32_in1k.py} (58%) rename configs/repvgg/{repvgg-B0_4xb64-coslr-120e_in1k.py => repvgg-B0_8xb32_in1k.py} (58%) rename configs/repvgg/{repvgg-B1_4xb64-coslr-120e_in1k.py => repvgg-B1_8xb32_in1k.py} (58%) rename configs/repvgg/{repvgg-B1g2_4xb64-coslr-120e_in1k.py => repvgg-B1g2_8xb32_in1k.py} (59%) rename configs/repvgg/{repvgg-B1g4_4xb64-coslr-120e_in1k.py => repvgg-B1g4_8xb32_in1k.py} (59%) rename configs/repvgg/{repvgg-B2_4xb64-coslr-120e_in1k.py => repvgg-B2_8xb32_in1k.py} (58%) delete mode 100644 configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py create mode 100644 configs/repvgg/repvgg-B2g4_8xb32_in1k.py rename configs/repvgg/{repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py => repvgg-B3_8xb32_in1k.py} (54%) delete mode 100644 configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py create mode 100644 configs/repvgg/repvgg-B3g4_8xb32_in1k.py delete mode 100644 configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py create mode 100644 configs/repvgg/repvgg-D2se_8xb32_in1k.py diff --git a/configs/repvgg/README.md b/configs/repvgg/README.md index a1bded13eb0..a6cf6c98d13 100644 --- a/configs/repvgg/README.md +++ b/configs/repvgg/README.md @@ -1,43 +1,134 @@ # RepVGG -> [Repvgg: Making vgg-style convnets great again](https://arxiv.org/abs/2101.03697) +> [RepVGG: Making VGG-style ConvNets Great Again](https://arxiv.org/abs/2101.03697) -## Abstract +## Introduction -We present a simple but powerful architecture of convolutional neural network, which has a VGG-like inference-time body composed of nothing but a stack of 3x3 convolution and ReLU, while the training-time model has a multi-branch topology. Such decoupling of the training-time and inference-time architecture is realized by a structural re-parameterization technique so that the model is named RepVGG. On ImageNet, RepVGG reaches over 80% top-1 accuracy, which is the first time for a plain model, to the best of our knowledge. On NVIDIA 1080Ti GPU, RepVGG models run 83% faster than ResNet-50 or 101% faster than ResNet-101 with higher accuracy and show favorable accuracy-speed trade-off compared to the state-of-the-art models like EfficientNet and RegNet. +RepVGG is a VGG-style convolutional architecture. It has the following advantages: + +1. The model has a VGG-like plain (a.k.a. feed-forward) topology 1 without any branches. I.e., every layer takes the output of its only preceding layer as input and feeds the output into its only following layer. +2. The model’s body uses only 3 × 3 conv and ReLU. +3. The concrete architecture (including the specific depth and layer widths) is instantiated with no automatic search, manual refinement, compound scaling, nor other heavy designs.
-## Results and models +## Abstract -### ImageNet-1k +
-| Model | Epochs | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -| :-----------: | :----: | :-------------------------------: | :-----------------------------: | :-------: | :-------: | :----------------------------------------------: | :-------------------------------------------------: | -| RepVGG-A0\* | 120 | 9.11(train) \| 8.31 (deploy) | 1.52 (train) \| 1.36 (deploy) | 72.41 | 90.50 | [config (train)](./repvgg-A0_4xb64-coslr-120e_in1k.py) \| [config (deploy)](./deploy/repvgg-A0_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth) | -| RepVGG-A1\* | 120 | 14.09 (train) \| 12.79 (deploy) | 2.64 (train) \| 2.37 (deploy) | 74.47 | 91.85 | [config (train)](./repvgg-A1_4xb64-coslr-120e_in1k.py) \| [config (deploy)](./deploy/repvgg-A1_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth) | -| RepVGG-A2\* | 120 | 28.21 (train) \| 25.5 (deploy) | 5.7 (train) \| 5.12 (deploy) | 76.48 | 93.01 | [config (train)](./repvgg-A2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-A2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth) | -| RepVGG-B0\* | 120 | 15.82 (train) \| 14.34 (deploy) | 3.42 (train) \| 3.06 (deploy) | 75.14 | 92.42 | [config (train)](./repvgg-B0_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B0_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth) | -| RepVGG-B1\* | 120 | 57.42 (train) \| 51.83 (deploy) | 13.16 (train) \| 11.82 (deploy) | 78.37 | 94.11 | [config (train)](./repvgg-B1_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B1_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth) | -| RepVGG-B1g2\* | 120 | 45.78 (train) \| 41.36 (deploy) | 9.82 (train) \| 8.82 (deploy) | 77.79 | 93.88 | [config (train)](./repvgg-B1g2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth) | -| RepVGG-B1g4\* | 120 | 39.97 (train) \| 36.13 (deploy) | 8.15 (train) \| 7.32 (deploy) | 77.58 | 93.84 | [config (train)](./repvgg-B1g4_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth) | -| RepVGG-B2\* | 120 | 89.02 (train) \| 80.32 (deploy) | 20.46 (train) \| 18.39 (deploy) | 78.78 | 94.42 | [config (train)](./repvgg-B2_4xb64-coslr-120e_in1k.py) \|[config (deploy)](./deploy/repvgg-B2_deploy_4xb64-coslr-120e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth) | -| RepVGG-B2g4\* | 200 | 61.76 (train) \| 55.78 (deploy) | 12.63 (train) \| 11.34 (deploy) | 79.38 | 94.68 | [config (train)](./repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](./deploy/repvgg-B2g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth) | -| RepVGG-B3\* | 200 | 123.09 (train) \| 110.96 (deploy) | 29.17 (train) \| 26.22 (deploy) | 80.52 | 95.26 | [config (train)](./repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](./deploy/repvgg-B3_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth) | -| RepVGG-B3g4\* | 200 | 83.83 (train) \| 75.63 (deploy) | 17.9 (train) \| 16.08 (deploy) | 80.22 | 95.10 | [config (train)](./repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](./deploy/repvgg-B3g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth) | -| RepVGG-D2se\* | 200 | 133.33 (train) \| 120.39 (deploy) | 36.56 (train) \| 32.85 (deploy) | 81.81 | 95.94 | [config (train)](./repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) \|[config (deploy)](./deploy/repvgg-D2se_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth) | - -*Models with * are converted from the [official repo](https://github.com/DingXiaoH/RepVGG). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* +Show the paper's abstract + +
+We present a simple but powerful architecture of convolutional neural network, which has a VGG-like inference-time body composed of nothing but a stack of 3x3 convolution and ReLU, while the training-time model has a multi-branch topology. Such decoupling of the training-time and inference-time architecture is realized by a structural re-parameterization technique so that the model is named RepVGG. On ImageNet, RepVGG reaches over 80% top-1 accuracy, which is the first time for a plain model, to the best of our knowledge. On NVIDIA 1080Ti GPU, RepVGG models run 83% faster than ResNet-50 or 101% faster than ResNet-101 with higher accuracy and show favorable accuracy-speed trade-off compared to the state-of-the-art models like EfficientNet and RegNet. +
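The key idea in the abstract is the structural re-parameterization step: after training, the parallel 3x3, 1x1 and identity branches of each block are folded into one equivalent 3x3 convolution. The snippet below is only an illustrative sketch of that algebra, not the mmcls implementation; `fuse_conv_bn` and `merge_branches` are hypothetical helper names, and the conv layers are assumed to be bias-free (as they are when followed by BatchNorm).

```python
import torch.nn.functional as F


def fuse_conv_bn(conv_w, bn):
    """Fold a BatchNorm layer into the (bias-free) conv kernel preceding it."""
    std = (bn.running_var + bn.eps).sqrt()
    fused_w = conv_w * (bn.weight / std).reshape(-1, 1, 1, 1)
    fused_b = bn.bias - bn.running_mean * bn.weight / std
    return fused_w, fused_b


def merge_branches(w3x3, b3x3, w1x1, b1x1, w_id=None, b_id=None):
    """Sum the fused branches into a single equivalent 3x3 convolution."""
    merged_w = w3x3 + F.pad(w1x1, [1, 1, 1, 1])  # zero-pad the 1x1 kernel to 3x3
    merged_b = b3x3 + b1x1
    if w_id is not None:  # identity branch expressed as a 3x3 kernel
        merged_w = merged_w + w_id
        merged_b = merged_b + b_id
    return merged_w, merged_b
```

In this codebase the same folding is performed in place by `backbone.switch_to_deploy()`, which is why the fused and unfused models agree up to small numerical differences (see the `torch.allclose` check in the usage examples below).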
+ +
## How to use -The checkpoints provided are all `training-time` models. Use the reparameterize tool to switch them to more efficient `inference-time` architecture, which not only has fewer parameters but also less calculations. +The checkpoints provided are all `training-time` models. Use the reparameterize tool or `switch_to_deploy` interface to switch them to more efficient `inference-time` architecture, which not only has fewer parameters but also less calculations. + + + +**Predict image** + +Use `classifier.backbone.switch_to_deploy()` interface to switch the RepVGG models into inference mode. + +```python +>>> import torch +>>> from mmcls.apis import init_model, inference_model +>>> +>>> model = init_model('configs/repvgg/repvgg-A0_8xb32_in1k.py', 'https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth') +>>> results = inference_model(model, 'demo/demo.JPEG') +>>> print( (results['pred_class'], results['pred_score']) ) +('sea snake' 0.8338906168937683) +>>> +>>> # switch to deploy mode +>>> model.backbone.switch_to_deploy() +>>> results = inference_model(model, 'demo/demo.JPEG') +>>> print( (results['pred_class'], results['pred_score']) ) +('sea snake', 0.7883061170578003) +``` + +**Use the model** + +```python +>>> import torch +>>> from mmcls.apis import get_model +>>> +>>> model = get_model("repvgg-a0_8xb32_in1k", pretrained=True) +>>> model.eval() +>>> inputs = torch.rand(1, 3, 224, 224).to(model.data_preprocessor.device) +>>> # To get classification scores. +>>> out = model(inputs) +>>> print(out.shape) +torch.Size([1, 1000]) +>>> # To extract features. +>>> outs = model.extract_feat(inputs) +>>> print(outs[0].shape) +torch.Size([1, 1280]) +>>> +>>> # switch to deploy mode +>>> model.backbone.switch_to_deploy() +>>> out_deploy = model(inputs) +>>> print(out.shape) +torch.Size([1, 1000]) +>>> assert torch.allclose(out, out_deploy, rtol=1e-4, atol=1e-5) # pass without error +``` + +**Train/Test Command** + +Place the ImageNet dataset to the `data/imagenet/` directory, or prepare datasets according to the [docs](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#prepare-dataset). + +Train: + +```shell +python tools/train.py configs/repvgg/repvgg-a0_8xb32_in1k.py +``` + +Download Checkpoint: + +```shell +wget https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth +``` + +Test use unfused model: + +```shell +python tools/test.py configs/repvgg/repvgg-a0_8xb32_in1k.py repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth +``` -### Use tool +Reparameterize checkpoint: + +```shell +python ./tools/convert_models/reparameterize_model.py configs/repvgg/repvgg-a0_8xb32_in1k.py repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth repvgg_A0_deploy.pth +``` + +Test use fused model: + +```shell +python tools/test.py configs/repvgg/repvgg-A0_8xb32_in1k.py repvgg_A0_deploy.pth --cfg-options model.backbone.deploy=True +``` + +or + +```shell +python tools/test.py configs/repvgg/repvgg-A0_deploy_in1k.py repvgg_A0_deploy.pth +``` + + + +For more configurable parameters, please refer to the [API](https://mmclassification.readthedocs.io/en/1.x/api/generated/mmcls.models.backbones.RepVGG.html#mmcls.models.backbones.RepVGG). + +
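Under the hood, the reparameterization step above just loads the training-time checkpoint, switches the backbone to deploy mode and saves the fused weights. A minimal sketch of that flow is shown below; the file names are the ones used in the commands above, and the real `tools/convert_models/reparameterize_model.py` may store extra metadata alongside the state dict.

```python
import torch
from mmcls.apis import init_model

# Build the training-time (multi-branch) model from its config and checkpoint.
model = init_model('configs/repvgg/repvgg-A0_8xb32_in1k.py',
                   'repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth',
                   device='cpu')

# Fold every RepVGG block into a single 3x3 conv, in place.
model.backbone.switch_to_deploy()

# The saved weights now match configs that set `model.backbone.deploy=True`.
torch.save(model.state_dict(), 'repvgg_A0_deploy.pth')
```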
+ +How to use the reparameterisation tool (click to show) + +
Use provided tool to reparameterize the given model and save the checkpoint: @@ -45,52 +136,68 @@ Use provided tool to reparameterize the given model and save the checkpoint: python tools/convert_models/reparameterize_model.py ${CFG_PATH} ${SRC_CKPT_PATH} ${TARGET_CKPT_PATH} ``` -`${CFG_PATH}` is the config file, `${SRC_CKPT_PATH}` is the source chenpoint file, `${TARGET_CKPT_PATH}` is the target deploy weight file path. +`${CFG_PATH}` is the config file path, `${SRC_CKPT_PATH}` is the source chenpoint file path, `${TARGET_CKPT_PATH}` is the target deploy weight file path. -To use reparameterized weights, the config file must switch to the deploy config files. +For example: -```bash -python tools/test.py ${Deploy_CFG} ${Deploy_Checkpoint} --metrics accuracy +```shell +# download the weight +wget https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth +# reparameterize unfused weight to fused weight +python ./tools/convert_models/reparameterize_model.py configs/repvgg/repvgg-a0_8xb32_in1k.py repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth repvgg-A0_deploy.pth ``` -### In the code +To use reparameterized weights, the config file must switch to **the deploy config files** as [the deploy_A0 example](./repvgg-A0_deploy_in1k.py) or add `--cfg-options model.backbone.deploy=True` in command. -Use `backbone.switch_to_deploy()` or `classificer.backbone.switch_to_deploy()` to switch to the deploy mode. For example: +For example of using the reparameterized weights above: -```python -from mmcls.models import build_backbone +```shell +python ./tools/test.py ./configs/repvgg/repvgg-A0_deploy_in1k.py repvgg-A0_deploy.pth +``` + +You can get other deploy configs by modifying the [A0_deploy example](./repvgg-A0_deploy_in1k.py): + +```text +# in repvgg-A0_deploy_in1k.py +_base_ = '../repvgg-A0_8xb32_in1k.py' # basic A0 config -backbone_cfg=dict(type='RepVGG',arch='A0'), -backbone = build_backbone(backbone_cfg) -backbone.switch_to_deploy() +model = dict(backbone=dict(deploy=True)) # switch model into deploy mode ``` -or +or add `--cfg-options model.backbone.deploy=True` in command as following: -```python -from mmcls.models import build_classifier - -cfg = dict( - type='ImageClassifier', - backbone=dict( - type='RepVGG', - arch='A0'), - neck=dict(type='GlobalAveragePooling'), - head=dict( - type='LinearClsHead', - num_classes=1000, - in_channels=1280, - loss=dict(type='CrossEntropyLoss', loss_weight=1.0), - topk=(1, 5), - )) - -classifier = build_classifier(cfg) -classifier.backbone.switch_to_deploy() +```shell +python tools/test.py configs/repvgg/repvgg-A0_8xb32_in1k.py repvgg_A0_deploy.pth --cfg-options model.backbone.deploy=True ``` +
+ +
+ +## Results and models + +### ImageNet-1k + +| Model | Pretrain | Params(M) (train\|deploy) | Flops(G) (train\|deploy) 
| Top-1 (%) | Top-5 (%) | Config | Download | +| :-------------------------: | :----------: | :-------------------------------------: | :--------------------------------------: | :-------: | :-------: | :-----------------------------: | :-------------------------------: | +| repvgg-A0_8xb32_in1k | From scratch | 9.11 \| 8.31 | 1.53 \| 1.36 | 72.37 | 90.56 | [config](./repvgg-A0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_8xb32_in1k_20221213-60ae8e23.log) | +| repvgg-A1_8xb32_in1k | From scratch | 14.09 \| 12.79 | 2.65 \| 2.37 | 74.47 | 91.85 | [config](./repvgg-A1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_8xb32_in1k_20221213-f81bf3df.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_8xb32_in1k_20221213-f81bf3df.log) | +| repvgg-A2_8xb32_in1k | From scratch | 28.21 \| 25.5 | 5.72 \| 5.12 | 76.49 | 93.09 | [config](./repvgg-A2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_8xb32_in1k_20221213-a8767caf.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_8xb32_in1k_20221213-a8767caf.log) | +| repvgg-B0_8xb32_in1k | From scratch | 15.82 \| 14.34 | 3.43 \| 3.06 | 75.27 | 92.21 | [config](./repvgg-B0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_8xb32_in1k_20221213-5091ecc7.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_8xb32_in1k_20221213-5091ecc7.log) | +| repvgg-B1_8xb32_in1k | From scratch | 57.42 \| 51.83 | 13.20 \| 11.81 | 78.19 | 94.04 | [config](./repvgg-B1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_8xb32_in1k_20221213-d17c45e7.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_8xb32_in1k_20221213-d17c45e7.log) | +| repvgg-B1g2_8xb32_in1k | From scratch | 45.78 \| 41.36 | 9.86 \| 8.80 | 77.87 | 93.99 | [config](./repvgg-B1g2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_8xb32_in1k_20221213-ae6428fd.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_8xb32_in1k_20221213-ae6428fd.log) | +| repvgg-B1g4_8xb32_in1k | From scratch | 39.97 \| 36.13 | 8.19 \| 7.30 | 77.81 | 93.77 | [config](./repvgg-B1g4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_8xb32_in1k_20221213-a7a4aaea.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_8xb32_in1k_20221213-a7a4aaea.log) | +| repvgg-B2_8xb32_in1k | From scratch | 89.02 \| 80.32 | 20.5 \| 18.4 | 78.58 | 94.23 | [config](./repvgg-B2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_8xb32_in1k_20221213-d8b420ef.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_8xb32_in1k_20221213-d8b420ef.log) | +| repvgg-B2g4_8xb32_in1k | From scratch | 61.76 \| 55.78 | 12.7 \| 11.3 | 79.44 | 94.72 | [config](./repvgg-B2g4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_8xb32_in1k_20221213-0c1990eb.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_8xb32_in1k_20221213-0c1990eb.log) | +| repvgg-B3_8xb32_in1k | From scratch | 123.09 \| 110.96 | 29.2 \| 26.2 | 80.58 | 95.33 | 
[config](./repvgg-B3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_8xb32_in1k_20221213-927a329a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_8xb32_in1k_20221213-927a329a.log) | +| repvgg-B3g4_8xb32_in1k | From scratch | 83.83 \| 75.63 | 18.0 \| 16.1 | 80.26 | 95.15 | [config](./repvgg-B3g4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_8xb32_in1k_20221213-e01cb280.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_8xb32_in1k_20221213-e01cb280.log) | +| repvgg-D2se_3rdparty_in1k\* | From scratch | 133.33 \| 120.39 | 36.6 \| 32.8 | 81.81 | 95.94 | [config](./repvgg-D2se_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth) | + +*Models with * are converted from the [official repo](https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L250). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + ## Citation -``` +```bibtex @inproceedings{ding2021repvgg, title={Repvgg: Making vgg-style convnets great again}, author={Ding, Xiaohan and Zhang, Xiangyu and Ma, Ningning and Han, Jungong and Ding, Guiguang and Sun, Jian}, diff --git a/configs/repvgg/deploy/repvgg-A0_deploy_4xb64-coslr-120e_in1k.py b/configs/repvgg/deploy/repvgg-A0_deploy_4xb64-coslr-120e_in1k.py deleted file mode 100644 index 20787f286da..00000000000 --- a/configs/repvgg/deploy/repvgg-A0_deploy_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-A0_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-A1_deploy_4xb64-coslr-120e_in1k.py b/configs/repvgg/deploy/repvgg-A1_deploy_4xb64-coslr-120e_in1k.py deleted file mode 100644 index eea0da9c58c..00000000000 --- a/configs/repvgg/deploy/repvgg-A1_deploy_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-A1_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-A2_deploy_4xb64-coslr-120e_in1k.py b/configs/repvgg/deploy/repvgg-A2_deploy_4xb64-coslr-120e_in1k.py deleted file mode 100644 index 7b0cea7b7d5..00000000000 --- a/configs/repvgg/deploy/repvgg-A2_deploy_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-A2_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-B0_deploy_4xb64-coslr-120e_in1k.py b/configs/repvgg/deploy/repvgg-B0_deploy_4xb64-coslr-120e_in1k.py deleted file mode 100644 index 23a2898ac56..00000000000 --- a/configs/repvgg/deploy/repvgg-B0_deploy_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-B0_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-B1_deploy_4xb64-coslr-120e_in1k.py b/configs/repvgg/deploy/repvgg-B1_deploy_4xb64-coslr-120e_in1k.py deleted file mode 100644 index 24355edac7f..00000000000 --- a/configs/repvgg/deploy/repvgg-B1_deploy_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-B1_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py 
b/configs/repvgg/deploy/repvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py deleted file mode 100644 index 579fcc47b9c..00000000000 --- a/configs/repvgg/deploy/repvgg-B1g2_deploy_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-B1g2_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py b/configs/repvgg/deploy/repvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py deleted file mode 100644 index eab5d440374..00000000000 --- a/configs/repvgg/deploy/repvgg-B1g4_deploy_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-B1g4_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-B2_deploy_4xb64-coslr-120e_in1k.py b/configs/repvgg/deploy/repvgg-B2_deploy_4xb64-coslr-120e_in1k.py deleted file mode 100644 index 0681f14dc36..00000000000 --- a/configs/repvgg/deploy/repvgg-B2_deploy_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-B2_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-B2g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/deploy/repvgg-B2g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py deleted file mode 100644 index 8f1840145f7..00000000000 --- a/configs/repvgg/deploy/repvgg-B2g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-B3_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/deploy/repvgg-B3_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py deleted file mode 100644 index e60b0678a9e..00000000000 --- a/configs/repvgg/deploy/repvgg-B3_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-B3g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/deploy/repvgg-B3g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py deleted file mode 100644 index 46f187789a3..00000000000 --- a/configs/repvgg/deploy/repvgg-B3g4_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/deploy/repvgg-D2se_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/deploy/repvgg-D2se_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py deleted file mode 100644 index 66dff3b6d44..00000000000 --- a/configs/repvgg/deploy/repvgg-D2se_deploy_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = '../repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' - -model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/metafile.yml b/configs/repvgg/metafile.yml index 84fee5911c1..8c550729aea 100644 --- a/configs/repvgg/metafile.yml +++ b/configs/repvgg/metafile.yml @@ -14,57 +14,48 @@ Collections: Version: v0.16.0 Models: - - Name: repvgg-A0_3rdparty_4xb64-coslr-120e_in1k + - Name: repvgg-A0_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py + Config: configs/repvgg/repvgg-A0_8xb32_in1k.py Metadata: - FLOPs: 1520000000 - Parameters: 9110000 + FLOPs: 1360233728 + Parameters: 8309384 Results: - 
Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 72.41 - Top 5 Accuracy: 90.50 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L196 - - Name: repvgg-A1_3rdparty_4xb64-coslr-120e_in1k + Top 1 Accuracy: 72.37 + Top 5 Accuracy: 90.56 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth + - Name: repvgg-A1_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py + Config: configs/repvgg/repvgg-A1_8xb32_in1k.py Metadata: - FLOPs: 2640000000 - Parameters: 14090000 + FLOPs: 2362750208 + Parameters: 12789864 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 74.47 - Top 5 Accuracy: 91.85 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L200 - - Name: repvgg-A2_3rdparty_4xb64-coslr-120e_in1k + Top 1 Accuracy: 74.23 + Top 5 Accuracy: 91.80 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_8xb32_in1k_20221213-f81bf3df.pth + - Name: repvgg-A2_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py + Config: configs/repvgg/repvgg-A2_8xb32_in1k.py Metadata: - FLOPs: 28210000000 - Parameters: 5700000 + FLOPs: 5115612544 + Parameters: 25499944 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 76.48 - Top 5 Accuracy: 93.01 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L204 - - Name: repvgg-B0_3rdparty_4xb64-coslr-120e_in1k + Top 1 Accuracy: 76.49 + Top 5 Accuracy: 93.09 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_8xb32_in1k_20221213-a8767caf.pth + - Name: repvgg-B0_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py + Config: configs/repvgg/repvgg-B0_8xb32_in1k.py Metadata: FLOPs: 15820000000 Parameters: 3420000 @@ -72,130 +63,106 @@ Models: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 75.14 - Top 5 Accuracy: 92.42 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L208 - - Name: repvgg-B1_3rdparty_4xb64-coslr-120e_in1k + Top 1 Accuracy: 75.27 + Top 5 Accuracy: 92.21 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_8xb32_in1k_20221213-5091ecc7.pth + - Name: repvgg-B1_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py + Config: 
configs/repvgg/repvgg-B1_8xb32_in1k.py Metadata: - FLOPs: 57420000000 - Parameters: 13160000 + FLOPs: 11813537792 + Parameters: 51829480 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 78.37 - Top 5 Accuracy: 94.11 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L212 - - Name: repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k + Top 1 Accuracy: 78.19 + Top 5 Accuracy: 94.04 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_8xb32_in1k_20221213-d17c45e7.pth + - Name: repvgg-B1g2_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py + Config: configs/repvgg/repvgg-B1g2_8xb32_in1k.py Metadata: - FLOPs: 45780000000 - Parameters: 9820000 + FLOPs: 8807794688 + Parameters: 41360104 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 77.79 - Top 5 Accuracy: 93.88 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L216 - - Name: repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k + Top 1 Accuracy: 77.87 + Top 5 Accuracy: 93.99 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_8xb32_in1k_20221213-ae6428fd.pth + - Name: repvgg-B1g4_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py + Config: configs/repvgg/repvgg-B1g4_8xb32_in1k.py Metadata: - FLOPs: 39970000000 - Parameters: 8150000 + FLOPs: 7304923136 + Parameters: 36125416 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 77.58 - Top 5 Accuracy: 93.84 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L220 - - Name: repvgg-B2_3rdparty_4xb64-coslr-120e_in1k + Top 1 Accuracy: 77.81 + Top 5 Accuracy: 93.77 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_8xb32_in1k_20221213-a7a4aaea.pth + - Name: repvgg-B2_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py + Config: configs/repvgg/repvgg-B2_8xb32_in1k.py Metadata: - FLOPs: 89020000000 - Parameters: 20420000 + FLOPs: 18374175232 + Parameters: 80315112 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 78.78 - Top 5 Accuracy: 94.42 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L225 - - Name: repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k + Top 1 Accuracy: 78.58 + Top 5 Accuracy: 94.23 + Weights: 
https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_8xb32_in1k_20221213-d8b420ef.pth + - Name: repvgg-B2g4_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py + Config: configs/repvgg/repvgg-B2g4_8xb32_in1k.py Metadata: - FLOPs: 61760000000 - Parameters: 12630000 + FLOPs: 11329464832 + Parameters: 55777512 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 79.38 - Top 5 Accuracy: 94.68 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L229 - - Name: repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k + Top 1 Accuracy: 79.44 + Top 5 Accuracy: 94.72 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_8xb32_in1k_20221213-0c1990eb.pth + - Name: repvgg-B3_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py + Config: configs/repvgg/repvgg-B3_8xb32_in1k.py Metadata: - FLOPs: 123090000000 - Parameters: 29170000 + FLOPs: 26206448128 + Parameters: 110960872 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 80.52 - Top 5 Accuracy: 95.26 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L238 - - Name: repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k + Top 1 Accuracy: 80.58 + Top 5 Accuracy: 95.33 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_8xb32_in1k_20221213-927a329a.pth + - Name: repvgg-B3g4_8xb32_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py + Config: configs/repvgg/repvgg-B3g4_8xb32_in1k.py Metadata: - FLOPs: 83830000000 - Parameters: 17900000 + FLOPs: 16062065152 + Parameters: 75626728 Results: - Dataset: ImageNet-1k Task: Image Classification Metrics: - Top 1 Accuracy: 80.22 - Top 5 Accuracy: 95.10 - Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth - Converted From: - Weights: https://drive.google.com/drive/folders/1Avome4KvNp0Lqh2QwhXO6L5URQjzCjUq - Code: https://github.com/DingXiaoH/RepVGG/blob/9f272318abfc47a2b702cd0e916fca8d25d683e7/repvgg.py#L238 - - Name: repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k + Top 1 Accuracy: 80.26 + Top 5 Accuracy: 95.15 + Weights: https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_8xb32_in1k_20221213-e01cb280.pth + - Name: repvgg-D2se_3rdparty_in1k In Collection: RepVGG - Config: configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py + Config: configs/repvgg/repvgg-D2se_8xb32_in1k.py Metadata: - FLOPs: 133330000000 - Parameters: 36560000 + FLOPs: 32838581760 + Parameters: 120387572 Results: - Dataset: ImageNet-1k Task: Image Classification diff --git a/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py deleted file mode 100644 
index 8a93ed0a08c..00000000000 --- a/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = [ - '../_base_/models/repvgg-A0_in1k.py', - '../_base_/datasets/imagenet_bs64_pil_resize.py', - '../_base_/schedules/imagenet_bs256_coslr.py', - '../_base_/default_runtime.py' -] - -# schedule settings -param_scheduler = dict( - type='CosineAnnealingLR', T_max=120, by_epoch=True, begin=0, end=120) - -train_cfg = dict(by_epoch=True, max_epochs=120) diff --git a/configs/repvgg/repvgg-A0_8xb32_in1k.py b/configs/repvgg/repvgg-A0_8xb32_in1k.py new file mode 100644 index 00000000000..b767ae2a3e4 --- /dev/null +++ b/configs/repvgg/repvgg-A0_8xb32_in1k.py @@ -0,0 +1,33 @@ +_base_ = [ + '../_base_/models/repvgg-A0_in1k.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] + +val_dataloader = dict(batch_size=256) +test_dataloader = dict(batch_size=256) + +# schedule settings +optim_wrapper = dict( + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys={ + 'branch_3x3.norm': dict(decay_mult=0.0), + 'branch_1x1.norm': dict(decay_mult=0.0), + 'branch_norm.bias': dict(decay_mult=0.0), + })) + +# schedule settings +param_scheduler = dict( + type='CosineAnnealingLR', + T_max=120, + by_epoch=True, + begin=0, + end=120, + convert_to_iter_based=True) + +train_cfg = dict(by_epoch=True, max_epochs=120) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) diff --git a/configs/repvgg/repvgg-A0_deploy_in1k.py b/configs/repvgg/repvgg-A0_deploy_in1k.py new file mode 100644 index 00000000000..16f0bbfcc7c --- /dev/null +++ b/configs/repvgg/repvgg-A0_deploy_in1k.py @@ -0,0 +1,3 @@ +_base_ = '../repvgg-A0_8xb32_in1k.py' + +model = dict(backbone=dict(deploy=True)) diff --git a/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py deleted file mode 100644 index 649020f2c6f..00000000000 --- a/configs/repvgg/repvgg-A1_4xb64-coslr-120e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' - -model = dict(backbone=dict(arch='A1')) diff --git a/configs/repvgg/repvgg-A1_8xb32_in1k.py b/configs/repvgg/repvgg-A1_8xb32_in1k.py new file mode 100644 index 00000000000..fab5e586359 --- /dev/null +++ b/configs/repvgg/repvgg-A1_8xb32_in1k.py @@ -0,0 +1,3 @@ +_base_ = './repvgg-A0_8xb32_in1k.py' + +model = dict(backbone=dict(arch='A1')) diff --git a/configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-A2_8xb32_in1k.py similarity index 58% rename from configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py rename to configs/repvgg/repvgg-A2_8xb32_in1k.py index eedaf2d29b7..f6196f02fbf 100644 --- a/configs/repvgg/repvgg-A2_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-A2_8xb32_in1k.py @@ -1,3 +1,3 @@ -_base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' +_base_ = './repvgg-A0_8xb32_in1k.py' model = dict(backbone=dict(arch='A2'), head=dict(in_channels=1408)) diff --git a/configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B0_8xb32_in1k.py similarity index 58% rename from configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py rename to configs/repvgg/repvgg-B0_8xb32_in1k.py index b3ce7ea27d2..9bbc4ab2259 100644 --- a/configs/repvgg/repvgg-B0_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B0_8xb32_in1k.py @@ -1,3 +1,3 @@ -_base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' +_base_ = './repvgg-A0_8xb32_in1k.py' model = dict(backbone=dict(arch='B0'), 
head=dict(in_channels=1280)) diff --git a/configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B1_8xb32_in1k.py similarity index 58% rename from configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py rename to configs/repvgg/repvgg-B1_8xb32_in1k.py index 30adea3dc8e..e08db3c4b81 100644 --- a/configs/repvgg/repvgg-B1_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B1_8xb32_in1k.py @@ -1,3 +1,3 @@ -_base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' +_base_ = './repvgg-A0_8xb32_in1k.py' model = dict(backbone=dict(arch='B1'), head=dict(in_channels=2048)) diff --git a/configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B1g2_8xb32_in1k.py similarity index 59% rename from configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py rename to configs/repvgg/repvgg-B1g2_8xb32_in1k.py index 2749db8d955..a1c53fded4e 100644 --- a/configs/repvgg/repvgg-B1g2_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B1g2_8xb32_in1k.py @@ -1,3 +1,3 @@ -_base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' +_base_ = './repvgg-A0_8xb32_in1k.py' model = dict(backbone=dict(arch='B1g2'), head=dict(in_channels=2048)) diff --git a/configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B1g4_8xb32_in1k.py similarity index 59% rename from configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py rename to configs/repvgg/repvgg-B1g4_8xb32_in1k.py index 2647690975d..0757b1e580e 100644 --- a/configs/repvgg/repvgg-B1g4_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B1g4_8xb32_in1k.py @@ -1,3 +1,3 @@ -_base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' +_base_ = './repvgg-A0_8xb32_in1k.py' model = dict(backbone=dict(arch='B1g4'), head=dict(in_channels=2048)) diff --git a/configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py b/configs/repvgg/repvgg-B2_8xb32_in1k.py similarity index 58% rename from configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py rename to configs/repvgg/repvgg-B2_8xb32_in1k.py index 4d215567f4d..b9a7d4ca557 100644 --- a/configs/repvgg/repvgg-B2_4xb64-coslr-120e_in1k.py +++ b/configs/repvgg/repvgg-B2_8xb32_in1k.py @@ -1,3 +1,3 @@ -_base_ = './repvgg-A0_4xb64-coslr-120e_in1k.py' +_base_ = './repvgg-A0_8xb32_in1k.py' model = dict(backbone=dict(arch='B2'), head=dict(in_channels=2560)) diff --git a/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py deleted file mode 100644 index 11331cf02f2..00000000000 --- a/configs/repvgg/repvgg-B2g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = './repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' - -model = dict(backbone=dict(arch='B2g4')) diff --git a/configs/repvgg/repvgg-B2g4_8xb32_in1k.py b/configs/repvgg/repvgg-B2g4_8xb32_in1k.py new file mode 100644 index 00000000000..8b3397881d7 --- /dev/null +++ b/configs/repvgg/repvgg-B2g4_8xb32_in1k.py @@ -0,0 +1,3 @@ +_base_ = './repvgg-B3_8xb32_in1k.py' + +model = dict(backbone=dict(arch='B2g4'), head=dict(in_channels=2560)) diff --git a/configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/repvgg-B3_8xb32_in1k.py similarity index 54% rename from configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py rename to configs/repvgg/repvgg-B3_8xb32_in1k.py index 98bcad22da0..2d5d6e1358a 100644 --- a/configs/repvgg/repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ b/configs/repvgg/repvgg-B3_8xb32_in1k.py @@ -1,10 +1,20 @@ _base_ = [ '../_base_/models/repvgg-B3_lbs-mixup_in1k.py', - 
'../_base_/datasets/imagenet_bs64_pil_resize.py', - '../_base_/schedules/imagenet_bs256_200e_coslr_warmup.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', '../_base_/default_runtime.py' ] +# schedule settings +optim_wrapper = dict( + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys={ + 'branch_3x3.norm': dict(decay_mult=0.0), + 'branch_1x1.norm': dict(decay_mult=0.0), + 'branch_norm.bias': dict(decay_mult=0.0), + })) + data_preprocessor = dict( # RGB format normalization parameters mean=[123.675, 116.28, 103.53], @@ -21,8 +31,12 @@ dict(type='RandomResizedCrop', scale=224, backend='pillow'), dict(type='RandomFlip', prob=0.5, direction='horizontal'), dict( - type='AutoAugment', - policies='imagenet', + type='RandAugment', + policies='timm_increasing', + num_policies=2, + total_level=10, + magnitude_level=7, + magnitude_std=0.5, hparams=dict(pad_val=[round(x) for x in bgr_mean])), dict(type='PackClsInputs'), ] @@ -37,3 +51,17 @@ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) + +# schedule settings +param_scheduler = dict( + type='CosineAnnealingLR', + T_max=200, + by_epoch=True, + begin=0, + end=200, + convert_to_iter_based=True) + +train_cfg = dict(by_epoch=True, max_epochs=200) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) diff --git a/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py deleted file mode 100644 index 67e3688c5ae..00000000000 --- a/configs/repvgg/repvgg-B3g4_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = './repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' - -model = dict(backbone=dict(arch='B3g4')) diff --git a/configs/repvgg/repvgg-B3g4_8xb32_in1k.py b/configs/repvgg/repvgg-B3g4_8xb32_in1k.py new file mode 100644 index 00000000000..b0c5c00af84 --- /dev/null +++ b/configs/repvgg/repvgg-B3g4_8xb32_in1k.py @@ -0,0 +1,3 @@ +_base_ = './repvgg-B3_8xb32_in1k.py' + +model = dict(backbone=dict(arch='B3g4')) diff --git a/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py b/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py deleted file mode 100644 index d235610f07d..00000000000 --- a/configs/repvgg/repvgg-D2se_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = './repvgg-B3_4xb64-autoaug-lbs-mixup-coslr-200e_in1k.py' - -model = dict(backbone=dict(arch='D2se')) diff --git a/configs/repvgg/repvgg-D2se_8xb32_in1k.py b/configs/repvgg/repvgg-D2se_8xb32_in1k.py new file mode 100644 index 00000000000..f532dcd7968 --- /dev/null +++ b/configs/repvgg/repvgg-D2se_8xb32_in1k.py @@ -0,0 +1,28 @@ +_base_ = './repvgg-B3_8xb32_in1k.py' + +model = dict(backbone=dict(arch='D2se'), head=dict(in_channels=2560)) + +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=True, + begin=0, + end=5, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict( + type='CosineAnnealingLR', + T_max=295, + eta_min=1.0e-6, + by_epoch=True, + begin=5, + end=300) +] + +train_cfg = dict(by_epoch=True, max_epochs=300) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) From 4f5350f365b1932a3ee36637422f06f57dd8967c Mon Sep 17 00:00:00 2001 
From: "Mr.Li" <1055271769@qq.com> Date: Fri, 30 Dec 2022 15:52:57 +0800 Subject: [PATCH 20/21] [Doc] Fix typo. (#1281) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Fix] Fix imports in transforms. (#1255) * fix import * import from mmegine.utils * 修复错别字 Co-authored-by: Xieyuan Zhang <25652281+Francis777@users.noreply.github.com> --- docs/zh_CN/user_guides/finetune.md | 2 +- mmcls/datasets/transforms/auto_augment.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/zh_CN/user_guides/finetune.md b/docs/zh_CN/user_guides/finetune.md index acb28d00f10..bcbba3da65e 100644 --- a/docs/zh_CN/user_guides/finetune.md +++ b/docs/zh_CN/user_guides/finetune.md @@ -41,7 +41,7 @@ _base_ = [ ## 修改模型 -在进行模型微调是,我们通常希望在主干网络(backbone)加载预训练模型,再用我们的数据集训练一个新的分类头(head)。 +在进行模型微调时,我们通常希望在主干网络(backbone)加载预训练模型,再用我们的数据集训练一个新的分类头(head)。 为了在主干网络加载预训练模型,我们需要修改主干网络的初始化设置,使用 `Pretrained` 类型的初始化函数。另外,在初始化设置中,我们使用 `prefix='backbone'` diff --git a/mmcls/datasets/transforms/auto_augment.py b/mmcls/datasets/transforms/auto_augment.py index e289c216e2e..8e97cb245eb 100644 --- a/mmcls/datasets/transforms/auto_augment.py +++ b/mmcls/datasets/transforms/auto_augment.py @@ -7,10 +7,9 @@ import mmcv import numpy as np -from mmcv import BaseTransform, RandomChoice -from mmcv.transforms import Compose +from mmcv.transforms import BaseTransform, Compose, RandomChoice from mmcv.transforms.utils import cache_randomness -from mmengine import is_list_of, is_seq_of +from mmengine.utils import is_list_of, is_seq_of from mmcls.registry import TRANSFORMS From 0d8f918eaa8c9ec443badf6232561b93b9690fb9 Mon Sep 17 00:00:00 2001 From: Ma Zerun Date: Fri, 30 Dec 2022 17:32:04 +0800 Subject: [PATCH 21/21] Bump version to v1.0.0rc5. (#1286) --- README.md | 8 ++++++++ README_zh-CN.md | 7 +++++++ docs/en/notes/changelog.md | 36 ++++++++++++++++++++++++++++++++++++ mmcls/__init__.py | 2 +- mmcls/version.py | 2 +- requirements/mminstall.txt | 2 +- 6 files changed, 54 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 00a028fe286..5c761e061a5 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,14 @@ The `1.x` branch works with **PyTorch 1.6+**. ## What's new +v1.0.0rc5 was released in 30/12/2022 + +- Support **EVA**, **RevViT**, **EfficientnetV2**, **CLIP**, **TinyViT** and **MixMIM** backbones. +- Reproduce the training accuracy of **ConvNeXt** and **RepVGG**. +- Support **multi-task** training and testing. See [#1229](https://github.com/open-mmlab/mmclassification/pull/1229) for more details. +- Support Test-time Augmentation. See [#1161](https://github.com/open-mmlab/mmclassification/pull/1161) for + more details. + v1.0.0rc4 was released in 06/12/2022. - Upgrade API to get pre-defined models of MMClassification. See [#1236](https://github.com/open-mmlab/mmclassification/pull/1236) for more details. 
diff --git a/README_zh-CN.md b/README_zh-CN.md index 5fddca0ee54..13c12207643 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,6 +57,13 @@ MMClassification 是一款基于 PyTorch 的开源图像分类工具箱,是 [O ## 更新日志 +2022/12/30 发布了 v1.0.0rc5 版本 + +- 支持了**EVA**, **RevViT**, **EfficientnetV2**, **CLIP**, **TinyViT** 和 **MixMIM** 等骨干网络结构 +- 复现了 ConvNeXt 和 RepVGG 的训练精度。 +- 支持了 **多任务** 训练和测试,详见 [#1229](https://github.com/open-mmlab/mmclassification/pull/1229) +- 支持了测试时增强(TTA),详见 [#1161](https://github.com/open-mmlab/mmclassification/pull/1161) + 2022/12/06 发布了 v1.0.0rc4 版本 - 更新了主要 API 接口,用以方便地获取 MMClassification 中预定义的模型。详见 [#1236](https://github.com/open-mmlab/mmclassification/pull/1236)。 diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index 1baaf185668..9fc1b2eddb8 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,41 @@ # Changelog +## v1.0.0rc5(30/12/2022) + +### Highlights + +- Support EVA, RevViT, EfficientnetV2, CLIP, TinyViT and MixMIM backbones. +- Reproduce the training accuracy of ConvNeXt and RepVGG. +- Support multi-task training and testing. +- Support Test-time Augmentation. + +### New Features + +- [Feature] Add EfficientnetV2 Backbone. ([#1253](https://github.com/open-mmlab/mmclassification/pull/1253)) +- [Feature] Support TTA and add `--tta` in `tools/test.py`. ([#1161](https://github.com/open-mmlab/mmclassification/pull/1161)) +- [Feature] Support Multi-task. ([#1229](https://github.com/open-mmlab/mmclassification/pull/1229)) +- [Feature] Add clip backbone. ([#1258](https://github.com/open-mmlab/mmclassification/pull/1258)) +- [Feature] Add mixmim backbone with checkpoints. ([#1224](https://github.com/open-mmlab/mmclassification/pull/1224)) +- [Feature] Add TinyViT for dev-1.x. ([#1042](https://github.com/open-mmlab/mmclassification/pull/1042)) +- [Feature] Add some scripts for development. ([#1257](https://github.com/open-mmlab/mmclassification/pull/1257)) +- [Feature] Support EVA. ([#1239](https://github.com/open-mmlab/mmclassification/pull/1239)) +- [Feature] Implementation of RevViT. ([#1127](https://github.com/open-mmlab/mmclassification/pull/1127)) + +### Improvements + +- [Reproduce] Reproduce RepVGG Training Accuracy. ([#1264](https://github.com/open-mmlab/mmclassification/pull/1264)) +- [Enhance] Support ConvNeXt More Weights. ([#1240](https://github.com/open-mmlab/mmclassification/pull/1240)) +- [Reproduce] Update ConvNeXt config files. ([#1256](https://github.com/open-mmlab/mmclassification/pull/1256)) +- [CI] Update CI to test PyTorch 1.13.0. ([#1260](https://github.com/open-mmlab/mmclassification/pull/1260)) +- [Project] Add ACCV workshop 1st Solution. ([#1245](https://github.com/open-mmlab/mmclassification/pull/1245)) +- [Project] Add Example project. ([#1254](https://github.com/open-mmlab/mmclassification/pull/1254)) + +### Bug Fixes + +- [Fix] Fix imports in transforms. ([#1255](https://github.com/open-mmlab/mmclassification/pull/1255)) +- [Fix] Fix CAM visualization. ([#1248](https://github.com/open-mmlab/mmclassification/pull/1248)) +- [Fix] Fix the requirements and lazy register mmcls models. 
([#1275](https://github.com/open-mmlab/mmclassification/pull/1275)) + ## v1.0.0rc4(06/12/2022) ### Highlights diff --git a/mmcls/__init__.py b/mmcls/__init__.py index f1a297817ed..19f40fa8ede 100644 --- a/mmcls/__init__.py +++ b/mmcls/__init__.py @@ -10,7 +10,7 @@ mmcv_maximum_version = '2.0.0' mmcv_version = digit_version(mmcv.__version__) -mmengine_minimum_version = '0.2.0' +mmengine_minimum_version = '0.4.0' mmengine_maximum_version = '1.0.0' mmengine_version = digit_version(mmengine.__version__) diff --git a/mmcls/version.py b/mmcls/version.py index 6a6b4cae5d7..e994544c5ae 100644 --- a/mmcls/version.py +++ b/mmcls/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved -__version__ = '1.0.0rc4' +__version__ = '1.0.0rc5' def parse_version_info(version_str): diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt index 18016ab9e65..3b8103c0321 100644 --- a/requirements/mminstall.txt +++ b/requirements/mminstall.txt @@ -1,2 +1,2 @@ mmcv>=2.0.0rc1,<=2.0.0 -mmengine>=0.2.0,<1.0.0 +mmengine>=0.4.0,<1.0.0
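
The last hunks above raise the minimum MMEngine requirement from 0.2.0 to 0.4.0 (in both `mmcls/__init__.py` and `requirements/mminstall.txt`). As a quick sanity check after upgrading to v1.0.0rc5, one could verify an existing environment against the new floors. The snippet below is an illustrative sketch only, not part of the patch; it reuses `digit_version` from `mmengine.utils`, the same helper `mmcls/__init__.py` relies on for its compatibility check.

```python
# Illustrative sketch: check installed versions against the new minimums
# from requirements/mminstall.txt (mmengine>=0.4.0, mmcv>=2.0.0rc1).
import mmcv
import mmengine
from mmengine.utils import digit_version

minimums = {
    'mmengine': (mmengine.__version__, '0.4.0'),  # bumped from 0.2.0 in this release
    'mmcv': (mmcv.__version__, '2.0.0rc1'),
}

for name, (installed, minimum) in minimums.items():
    assert digit_version(installed) >= digit_version(minimum), (
        f'{name}=={installed} is older than the required minimum {minimum}; '
        f'please upgrade it before using mmcls 1.0.0rc5.')
    print(f'{name}=={installed} satisfies >= {minimum}')
```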