diff --git a/configs/_base_/models/tin_r50.py b/configs/_base_/models/tin_r50.py index 4a0dbbf6a0..cc27704f07 100644 --- a/configs/_base_/models/tin_r50.py +++ b/configs/_base_/models/tin_r50.py @@ -1,7 +1,9 @@ # model settings preprocess_cfg = dict( - mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], format_shape='NCHW') + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW') model = dict( type='Recognizer2D', diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md index 1e3db9145b..a250388fc3 100644 --- a/configs/recognition/tin/README.md +++ b/configs/recognition/tin/README.md @@ -34,7 +34,7 @@ For a long time, the vision community tries to learn the spatio-temporal represe | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | | :---------------------: | :------------: | :--: | :------: | :-------------: | :------: | :------: | :--------------: | :---------------------: | :--------: | :-----------------------: | :---------------------: | :---------------------: | -| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.77 | 90.36 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) | +| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.86 | 90.44 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) | Here, we use `finetune` to indicate that we use [TSM model](https://download.openmmlab.com/mmaction/v1.0/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth) trained on Kinetics-400 to finetune the TIN model on Kinetics-400. 
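The `tin_r50.py` change above swaps the generic `[127.5, 127.5, 127.5]` normalization for the standard ImageNet RGB statistics. A minimal sketch of what that per-channel normalization does to an `NCHW` batch (plain PyTorch, not the actual mmaction `ActionDataPreprocessor` internals):

```python
import torch

# ImageNet RGB statistics from the updated preprocess_cfg
mean = torch.tensor([123.675, 116.28, 103.53]).view(1, 3, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(1, 3, 1, 1)

def normalize_nchw(frames: torch.Tensor) -> torch.Tensor:
    """Normalize a float NCHW batch of 0-255 RGB frames per channel."""
    return (frames - mean) / std

# toy example: a batch of two 224x224 RGB frames
batch = torch.rand(2, 3, 224, 224) * 255.0
out = normalize_nchw(batch)
print(out.shape, out.mean().item())
```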
diff --git a/configs/recognition/tin/metafile.yml b/configs/recognition/tin/metafile.yml index 6f69c73fda..7954bd90b3 100644 --- a/configs/recognition/tin/metafile.yml +++ b/configs/recognition/tin/metafile.yml @@ -66,8 +66,8 @@ Models: Results: - Dataset: Kinetics-400 Metrics: - Top 1 Accuracy: 71.77 - Top 5 Accuracy: 90.36 + Top 1 Accuracy: 71.86 + Top 5 Accuracy: 90.44 Task: Action Recognition Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth diff --git a/dataset-index.yml b/dataset-index.yml index 57f2f4f62a..2637f88d4b 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -1,39 +1,40 @@ +openxlab: true kinetics400: - dataset: Kinetics-400 + dataset: OpenMMLab/Kinetics-400 download_root: data data_root: data/kinetics400 - script: tools/data/kinetics/k400_preprocess.sh + script: tools/data/kinetics/preprocess_k400.sh kinetics600: - dataset: Kinetics600 + dataset: OpenMMLab/Kinetics600 download_root: data data_root: data/kinetics600 - script: tools/data/kinetics/k600_preprocess.sh + script: tools/data/kinetics/preprocess_k600.sh kinetics700: - dataset: Kinetics_700 + dataset: OpenMMLab/Kinetics_700 download_root: data data_root: data/kinetics700 - script: tools/data/kinetics/k700_preprocess.sh + script: tools/data/kinetics/preprocess_k700.sh sthv2: - dataset: sthv2 + dataset: OpenDataLab/sthv2 download_root: data data_root: data/sthv2 script: tools/data/sthv2/preprocess.sh ucf-101: - dataset: UCF101 + dataset: OpenDataLab/UCF101 download_root: data data_root: data/ucf101 finegym: - dataset: FineGym + dataset: OpenDataLab/FineGym download_root: data data_root: data/gym diving48: - dataset: diving48 + dataset: OpenDataLab/diving48 download_root: data data_root: data/diving48 script: tools/data/diving48/preprocess.sh diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py new file mode 100644 index 0000000000..1dfd9f976c --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
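The `dataset-index.yml` rewrite above prefixes each dataset with its OpenXLab / OpenDataLab namespace and renames the Kinetics preprocess scripts. A hedged sketch of how such an index could be read with PyYAML; the `load_dataset_entry` helper is purely illustrative and is not the actual MIM/mmaction download tooling:

```python
import yaml  # PyYAML

def load_dataset_entry(index_path: str, name: str) -> dict:
    """Return the download/preprocess settings for one dataset key."""
    with open(index_path) as f:
        index = yaml.safe_load(f)
    entry = index[name]
    return {
        'dataset': entry['dataset'],              # e.g. 'OpenMMLab/Kinetics-400'
        'download_root': entry['download_root'],  # where archives are fetched
        'data_root': entry['data_root'],          # where extracted data lives
        'script': entry.get('script'),            # optional preprocess script
    }

if __name__ == '__main__':
    cfg = load_dataset_entry('dataset-index.yml', 'kinetics400')
    print(cfg)
```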
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=339, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/mit/videos/training' +data_root_val = 'data/mit/videos/validation' +ann_file_train = 'data/mit/mit_train_list_videos.txt' +ann_file_val = 'data/mit/mit_val_list_videos.txt' +ann_file_test = 'data/mit/mit_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + 
sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-5 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1 / 20, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min_ratio=1 / 20, + by_epoch=True, + begin=5, + end=24, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py new file mode 100644 index 0000000000..5b57aacfc6 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
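The Moments in Time config above warms the learning rate up linearly from `base_lr / 20` over the first 5 epochs and then decays it with a cosine schedule down to the same floor by epoch 24. A rough plain-PyTorch approximation of that shape (per-epoch stepping here, whereas the config converts both schedulers to iteration-based stepping; PyTorch's `CosineAnnealingLR` takes an absolute `eta_min`, not `eta_min_ratio`):

```python
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

base_lr = 2e-5
model = torch.nn.Linear(8, 8)
optimizer = AdamW(model.parameters(), lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)

# 5 warmup epochs from base_lr/20 to base_lr, then cosine decay to base_lr/20 over 19 epochs
warmup = LinearLR(optimizer, start_factor=1 / 20, total_iters=5)
cosine = CosineAnnealingLR(optimizer, T_max=19, eta_min=base_lr / 20)
scheduler = SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[5])

for epoch in range(24):
    optimizer.step()      # placeholder for one training epoch
    scheduler.step()
    print(epoch, scheduler.get_last_lr()[0])
```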
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + 
dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py new file mode 100644 index 0000000000..4616065b4c --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
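The Kinetics-400 config above initializes `UniFormerHead` from a Kinetics-710 checkpoint and passes `channel_map='.../map_k400.json'`. A hedged sketch of the idea, assuming the JSON holds, for each K400 class, its index in the 710-way classifier (an assumption about the mapping file, not the actual `UniFormerHead` code):

```python
import json
import torch

def remap_classifier(fc_weight: torch.Tensor, fc_bias: torch.Tensor,
                     channel_map_path: str):
    """Select the K400 rows of a 710-way classifier via a JSON index list."""
    with open(channel_map_path) as f:
        indices = torch.tensor(json.load(f), dtype=torch.long)
    return fc_weight[indices], fc_bias[indices]

# toy 710-way head and a fake identity-like map (the real one is map_k400.json)
w, b = torch.randn(710, 768), torch.randn(710)
with open('/tmp/map_k400.json', 'w') as f:
    json.dump(list(range(400)), f)

w400, b400 = remap_classifier(w, b, '/tmp/map_k400.json')
print(w400.shape, b400.shape)  # torch.Size([400, 768]) torch.Size([400])
```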
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k600.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics600/videos_train' +data_root_val = 'data/kinetics600/videos_val' +ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt' +ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt' +ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + 
dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py new file mode 100644 index 0000000000..32e1bc72e9 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
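Like the other fine-tuning configs, the Kinetics-600 config above enables `auto_scale_lr` with `base_batch_size=256`. The flag implies the usual linear scaling rule; the real scaling happens inside MMEngine's Runner, so the helper below is only the arithmetic:

```python
def scale_lr(base_lr: float, base_batch_size: int,
             num_gpus: int, samples_per_gpu: int) -> float:
    """Linear LR scaling: lr grows in proportion to the effective batch size."""
    effective_batch = num_gpus * samples_per_gpu
    return base_lr * effective_batch / base_batch_size

# e.g. doubling the GPU count doubles the effective batch size and the LR
print(scale_lr(2e-6, 256, 8, 32))   # 2e-06 (reference setup)
print(scale_lr(2e-6, 256, 16, 32))  # 4e-06
```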
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k700.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + 
dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000..84e6f6729f --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
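The test pipelines in these configs sample 4 uniform clips and apply `ThreeCrop`, and the heads set `average_clips='prob'`, i.e. per-view softmax scores are averaged into one video-level prediction. A minimal plain-PyTorch sketch of that averaging (not the actual mmaction head code):

```python
import torch
import torch.nn.functional as F

def average_clip_probs(clip_logits: torch.Tensor) -> torch.Tensor:
    """Average per-view class probabilities into a video-level score.

    clip_logits: (num_clips * num_crops, num_classes) raw logits, e.g.
    4 clips x 3 crops = 12 views per video under the test pipeline above.
    """
    probs = F.softmax(clip_logits, dim=-1)   # 'prob' averaging: softmax first
    return probs.mean(dim=0)                 # (num_classes,)

logits = torch.randn(12, 700)                # 12 views, 700 Kinetics-700 classes
video_score = average_clip_probs(logits)
print(video_score.shape, video_score.sum().item())  # sums to ~1.0
```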
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=710, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py new file mode 100644 index 0000000000..6db31b373e --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
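The pipelines in these configs pick frames with `UniformSample(clip_len=8/16/32)`, which spreads the sampled indices evenly over the whole video rather than taking a dense contiguous clip. A rough sketch of that behaviour (centre-of-segment in test mode, jittered offsets in training); this approximates the idea and is not the exact `UniformSample` implementation:

```python
import numpy as np

def uniform_sample(total_frames, clip_len, test_mode=False, seed=None):
    """Spread `clip_len` frame indices evenly over a video of `total_frames` frames."""
    rng = np.random.default_rng(seed)
    edges = np.linspace(0, total_frames, clip_len + 1)
    if test_mode:
        offsets = (edges[:-1] + edges[1:]) / 2.0          # segment centres
    else:
        offsets = edges[:-1] + rng.random(clip_len) * (edges[1:] - edges[:-1])
    return np.clip(offsets.astype(np.int64), 0, total_frames - 1)

print(uniform_sample(300, 8, test_mode=True))  # [ 18  56  93 131 168 206 243 281]
```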
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + 
batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py new file mode 100644 index 0000000000..6b8cb00c13 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
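The optimizer wrappers above set `paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)`, i.e. normalization parameters and biases are excluded from weight decay. A hedged plain-PyTorch sketch of building equivalent AdamW parameter groups (an approximation of what MMEngine's `paramwise_cfg` does, not its implementation):

```python
import torch
from torch import nn
from torch.optim import AdamW

def build_param_groups(model: nn.Module, weight_decay: float = 0.05):
    """Two groups: decayed weights vs. un-decayed norm parameters and biases."""
    decay, no_decay = [], []
    norm_types = (nn.LayerNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)
    for module in model.modules():
        for name, param in module.named_parameters(recurse=False):
            if not param.requires_grad:
                continue
            if isinstance(module, norm_types) or name.endswith('bias'):
                no_decay.append(param)
            else:
                decay.append(param)
    return [
        {'params': decay, 'weight_decay': weight_decay},
        {'params': no_decay, 'weight_decay': 0.0},
    ]

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.Linear(8, 4))
optimizer = AdamW(build_param_groups(model), lr=1e-5, betas=(0.9, 0.999))
print([len(g['params']) for g in optimizer.param_groups])  # [2, 4]
```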
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + 
batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py new file mode 100644 index 0000000000..72527a79be --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
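All of these configs clip gradients with `clip_grad=dict(max_norm=20, norm_type=2)`. In plain PyTorch the equivalent step is a single `clip_grad_norm_` call between `backward()` and `optimizer.step()`; a minimal sketch on a dummy model:

```python
import torch
from torch import nn
from torch.optim import AdamW

model = nn.Linear(16, 4)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.05)
criterion = nn.CrossEntropyLoss()

x = torch.randn(8, 16)
target = torch.randint(0, 4, (8,))

optimizer.zero_grad()
loss = criterion(model(x), target)
loss.backward()
# clip_grad=dict(max_norm=20, norm_type=2) in the config maps to this call
total_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)
optimizer.step()
print(float(loss), float(total_norm))
```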
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import ConcatDataset, DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, ConcatDataset, DecordDecode, + DecordInit, Flip, FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=710, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +# dataset settings +k400_data_root = 'data/kinetics400/videos_train' +k600_data_root = 'data/kinetics600/videos' +k700_data_root = 'data/kinetics700/videos' +k400_data_root_val = 'data/kinetics400/videos_val' +k600_data_root_val = k600_data_root +k700_data_root_val = k700_data_root + +k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt' +k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt' +k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt' + +k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt' +k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt' +k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' + +k400_trainset = dict( + type=VideoDataset, + ann_file=k400_ann_file_train, + data_prefix=dict(video=k400_data_root), + pipeline=train_pipeline) +k600_trainset = dict( + type=VideoDataset, + 
ann_file=k600_ann_file_train, + data_prefix=dict(video=k600_data_root), + pipeline=train_pipeline) +k700_trainset = dict( + type=VideoDataset, + ann_file=k700_ann_file_train, + data_prefix=dict(video=k700_data_root), + pipeline=train_pipeline) + +k400_valset = dict( + type=VideoDataset, + ann_file=k400_ann_file_val, + data_prefix=dict(video=k400_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k600_valset = dict( + type=VideoDataset, + ann_file=k600_ann_file_val, + data_prefix=dict(video=k600_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k700_valset = dict( + type=VideoDataset, + ann_file=k700_ann_file_val, + data_prefix=dict(video=k700_data_root_val), + pipeline=val_pipeline, + test_mode=True) + +k400_testset = k400_valset.copy() +k600_testset = k600_valset.copy() +k700_testset = k700_valset.copy() +k400_testset['pipeline'] = test_pipeline +k600_testset['pipeline'] = test_pipeline +k700_testset['pipeline'] = test_pipeline + +k710_trainset = dict( + type=ConcatDataset, datasets=[k400_trainset, k600_trainset, k700_trainset]) +k710_valset = dict( + type=ConcatDataset, datasets=[k400_valset, k600_valset, k700_valset]) +k710_testset = dict( + type=ConcatDataset, + datasets=[k400_testset, k600_testset, k700_testset], +) + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=k710_trainset) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=k710_valset) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=k710_testset) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=50, + eta_min_ratio=0.5, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py new file mode 100644 index 0000000000..8c53a18dee --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
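The Kinetics-710 config above trains on the concatenation of the K400, K600 and K700 subsets. A hedged sketch of the same idea with `torch.utils.data.ConcatDataset` and dummy stand-ins for `VideoDataset` (mmengine's `ConcatDataset` additionally merges dataset metainfo, which is omitted here; the labels are assumed to be pre-remapped to the shared 710-class space by the k710 annotation lists):

```python
from torch.utils.data import ConcatDataset, Dataset

class DummyVideoDataset(Dataset):
    """Stand-in for VideoDataset: each item is (fake clip id, label)."""
    def __init__(self, num_samples: int):
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return f'clip_{idx}', idx % 10

k400 = DummyVideoDataset(100)   # labels already remapped to K710 ids
k600 = DummyVideoDataset(80)
k700 = DummyVideoDataset(120)
k710 = ConcatDataset([k400, k600, k700])
print(len(k710), k710[0], k710[150])   # 300 samples drawn across all three
```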
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 16 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py new file mode 100644 index 0000000000..84d1b295ef --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
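The large-model test configs read `val.csv` annotation lists with `delimiter=','`. A hedged sketch of parsing such a list, assuming each line is `relative/path.mp4,label` (this format and the `parse_video_annotations` helper are assumptions for illustration, not the actual `VideoDataset` parser):

```python
def parse_video_annotations(ann_file: str, delimiter: str = ','):
    """Yield (relative video path, integer label) pairs from a list file."""
    samples = []
    with open(ann_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            path, label = line.rsplit(delimiter, 1)
            samples.append((path, int(label)))
    return samples

# toy annotation file in the assumed "path,label" format
with open('/tmp/val.csv', 'w') as f:
    f.write('abseiling/xxx.mp4,0\nzumba/yyy.mp4,399\n')
print(parse_video_annotations('/tmp/val.csv'))
```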
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 16 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py new file mode 100644 index 0000000000..b94bb75abf --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
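These test pipelines resize the short side to 224 and then apply `ThreeCrop(crop_size=224)`, producing three square views per clip along the longer side. An illustrative tensor-level version of that augmentation (not the actual mmaction transform):

```python
import torch

def three_crop(frames: torch.Tensor, crop_size: int) -> torch.Tensor:
    """Crop left/centre/right (or top/middle/bottom) squares from NCTHW frames.

    Assumes the short side already equals `crop_size`, as after
    Resize(scale=(-1, 224)) in the pipeline above.
    """
    h, w = frames.shape[-2:]
    crops = []
    if w >= h:  # landscape: slide along the width
        for x in (0, (w - crop_size) // 2, w - crop_size):
            crops.append(frames[..., :, x:x + crop_size])
    else:       # portrait: slide along the height
        for y in (0, (h - crop_size) // 2, h - crop_size):
            crops.append(frames[..., y:y + crop_size, :])
    return torch.stack(crops)  # (3, N, C, T, crop, crop)

clips = torch.rand(1, 3, 16, 224, 398)   # one clip, short side already 224
print(three_crop(clips, 224).shape)       # torch.Size([3, 1, 3, 16, 224, 224])
```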
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 16 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py new file mode 100644 index 0000000000..f1b8def59a --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py new file mode 100644 index 0000000000..c6e16ef759 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py new file mode 100644 index 0000000000..e715fca14f --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py new file mode 100644 index 0000000000..6391e01825 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py new file mode 100644 index 0000000000..dec1a65b6b --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py new file mode 100644 index 0000000000..8bc6cb4407 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000..c85b802da4 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=710, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py new file mode 100644 index 0000000000..373fe9f3bf --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py new file mode 100644 index 0000000000..3f1964d2c7 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py new file mode 100644 index 0000000000..0ef24778f9 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000..798a215bd1 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=710, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py new file mode 100644 index 0000000000..2687bec030 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=339, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/mit_v1' +ann_file_test = 'data/mit_v1/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=' ')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py new file mode 100644 index 0000000000..bddc27e89a --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=339, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/mit_v1' +ann_file_test = 'data/mit_v1/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=' ')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/models/backbones/resnet_tin.py b/mmaction/models/backbones/resnet_tin.py index b8ff3659f0..0958546926 100644 --- a/mmaction/models/backbones/resnet_tin.py +++ b/mmaction/models/backbones/resnet_tin.py @@ -325,6 +325,9 @@ def init_structure(self): if len(self.non_local_cfg) != 0: self.make_non_local() + def _get_wrap_prefix(self): + return ['.net2'] + def make_temporal_interlace(self): """Make temporal interlace for some layers.""" num_segment_list = [self.num_segments] * 4 @@ -365,6 +368,3 @@ def make_block_interlace(stage, num_segments, shift_div): self.shift_div) self.layer4 = make_block_interlace(self.layer4, num_segment_list[3], self.shift_div) - - def init_weights(self): - pass diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py index 0079c96cb7..a2dcaf2939 100644 --- a/mmaction/models/backbones/resnet_tsm.py +++ b/mmaction/models/backbones/resnet_tsm.py @@ -305,6 +305,9 @@ def make_non_local(self): self.num_segments, self.non_local_cfg) + def _get_wrap_prefix(self): + return ['.net', '.block'] + def load_original_weights(self, logger): """Load weights from original checkpoint, which required converting keys.""" @@ -317,7 +320,7 @@ def load_original_weights(self, logger): for name, module in self.named_modules(): # convert torchvision keys 
ori_name = name - for wrap_prefix in ['.net', '.block']: + for wrap_prefix in self._get_wrap_prefix(): if wrap_prefix in ori_name: ori_name = ori_name.replace(wrap_prefix, '') wrapped_layers_map[ori_name] = name @@ -352,6 +355,7 @@ def load_original_weights(self, logger): if layer_name in wrapped_layers_map: wrapped_name = param_name.replace( layer_name, wrapped_layers_map[layer_name]) + logger.debug(f'wrapped_name {wrapped_name}') state_dict_torchvision[ wrapped_name] = state_dict_torchvision.pop(param_name) diff --git a/projects/gesture_recognition/README.md b/projects/gesture_recognition/README.md index 47ca04e472..f3d097c869 100644 --- a/projects/gesture_recognition/README.md +++ b/projects/gesture_recognition/README.md @@ -14,7 +14,7 @@ Hand detection results on OneHand10K validation dataset | Config | Input Size | bbox mAP | bbox mAP 50 | bbox mAP 75 | ckpt | log | | :------------------------------------------------------ | :--------: | :------: | :---------: | :---------: | :---------------------------------------------------: | :--------------------------------------------------: | -| [rtmpose_nano](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) | 320x320 | 0.8100 | 0.9870 | 0.9190 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.log) | +| [rtmdet_nano](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) | 320x320 | 0.8100 | 0.9870 | 0.9190 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.log) | ## Pose estimation stage diff --git a/tools/data/diving48/README.md b/tools/data/diving48/README.md index 02d19d5a62..0a2b68707f 100644 --- a/tools/data/diving48/README.md +++ b/tools/data/diving48/README.md @@ -21,10 +21,10 @@ For basic dataset information, you can refer to the official dataset [website](h ````{group-tab} Download by MIM MIM supports downloading from OpenDataLab and preprocessing Diving48 dataset with one command line.
```Bash -# install OpenDataLab CLI tools -pip install -U opendatalab -# log in OpenDataLab -odl login +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login # download and preprocess by MIM mim download mmaction2 --dataset diving48 ``` diff --git a/tools/data/diving48/README_zh-CN.md b/tools/data/diving48/README_zh-CN.md index 825344039e..f6ebca9f2c 100644 --- a/tools/data/diving48/README_zh-CN.md +++ b/tools/data/diving48/README_zh-CN.md @@ -21,10 +21,10 @@ ````{group-tab} 使用 MIM 下载 # MIM 支持下载 Diving48 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。 ```Bash -# 安装 OpenDataLab CLI 工具 -pip install -U opendatalab -# 登录 OpenDataLab -odl login +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login # 通过 MIM 进行数据集下载,预处理。注意这将花费较长时间 mim download mmaction2 --dataset diving48 ``` diff --git a/tools/data/diving48/preprocess.sh b/tools/data/diving48/preprocess.sh index 10d9e42044..ddd9a1de48 100644 --- a/tools/data/diving48/preprocess.sh +++ b/tools/data/diving48/preprocess.sh @@ -3,6 +3,6 @@ DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/diving48/raw/*.tar.gz.* | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenDataLab___diving48/raw/*.tar.gz.* | tar -xvz -C $(dirname $DATA_ROOT) tar -xvf $DATA_ROOT/diving48.tar -C $(dirname $DATA_ROOT) rm $DATA_ROOT/diving48.tar diff --git a/tools/data/kinetics/README.md b/tools/data/kinetics/README.md index 257c650eba..bd96a5f364 100644 --- a/tools/data/kinetics/README.md +++ b/tools/data/kinetics/README.md @@ -38,10 +38,10 @@ All experiments on Kinetics in MMAction2 are based on this version, we recommend MIM supports downloading from OpenDataLab and preprocessing Kinetics-400/600/700 dataset with one command line. ```Bash -# install OpenDataLab CLI tools -pip install -U opendatalab -# log in OpenDataLab -odl login +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login # download and preprocess Kinetics-400 by MIM. Note that this might take a long time. mim download mmaction2 --dataset kinetics400 # download and preprocess Kinetics-600 by MIM. Note that this might take a long time. 
diff --git a/tools/data/kinetics/README_zh-CN.md b/tools/data/kinetics/README_zh-CN.md index a0d2e858e1..6047617985 100644 --- a/tools/data/kinetics/README_zh-CN.md +++ b/tools/data/kinetics/README_zh-CN.md @@ -36,10 +36,10 @@ MMAction2 代码仓库中提供的 Kinetics 实验性能,都是基于这个版 # MIM 支持下载 Kinetics-400/600/700 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。 ```Bash -# 安装 OpenDataLab CLI 工具 -pip install -U opendatalab -# 登录 OpenDataLab -odl login +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login # 通过 MIM 进行 Kinetics-400 数据集下载,预处理。注意这将花费较长时间 mim download mmaction2 --dataset kinetics400 # 通过 MIM 进行 Kinetics-600 数据集下载,预处理。注意这将花费较长时间 diff --git a/tools/data/kinetics/preprocess_k400.sh b/tools/data/kinetics/preprocess_k400.sh index 9f07885095..d48c166375 100644 --- a/tools/data/kinetics/preprocess_k400.sh +++ b/tools/data/kinetics/preprocess_k400.sh @@ -5,5 +5,5 @@ set -x DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/Kinetics-400/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics-400/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) mv $(dirname $DATA_ROOT)/Kinetics-400 $DATA_ROOT diff --git a/tools/data/kinetics/preprocess_k600.sh b/tools/data/kinetics/preprocess_k600.sh index 438297a620..e6264909cb 100644 --- a/tools/data/kinetics/preprocess_k600.sh +++ b/tools/data/kinetics/preprocess_k600.sh @@ -5,5 +5,5 @@ set -x DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/Kinetics600/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics600/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) mv $(dirname $DATA_ROOT)/Kinetics600 $DATA_ROOT diff --git a/tools/data/kinetics/preprocess_k700.sh b/tools/data/kinetics/preprocess_k700.sh index 930bf8577a..8e48cdcd47 100644 --- a/tools/data/kinetics/preprocess_k700.sh +++ b/tools/data/kinetics/preprocess_k700.sh @@ -5,5 +5,5 @@ set -x DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/Kinetics_700/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics_700/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) mv $(dirname $DATA_ROOT)/Kinetics_700 $DATA_ROOT diff --git a/tools/data/kinetics710/README.md b/tools/data/kinetics710/README.md index 76a239fbf5..4dd52b393f 100644 --- a/tools/data/kinetics710/README.md +++ b/tools/data/kinetics710/README.md @@ -25,10 +25,10 @@ Before we start, please make sure that the directory is located at `$MMACTION2`. Kinetics-710 is a video benchmark based on Kinetics-400/600/700, which merges the training set of these Kinetics datasets, and deletes the repeated videos according to Youtube IDs. MMAction2 provides an annotation file based on the Kinetics-400/600/700 on [OpenDataLab](https://opendatalab.com/). So we suggest you download Kinetics-400/600/700 first from OpenDataLab by [MIM](https://github.com/open-mmlab/mim). ```shell -# install OpenDataLab CLI tools -pip install -U opendatalab -# log in OpenDataLab -odl login +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login # download Kinetics-400/600/700, note that this might take a long time. 
mim download mmaction2 --dataset kinetics400 mim download mmaction2 --dataset kinetics600 diff --git a/tools/data/kinetics710/README_zh-CN.md b/tools/data/kinetics710/README_zh-CN.md index b428be43df..650a05c7d5 100644 --- a/tools/data/kinetics710/README_zh-CN.md +++ b/tools/data/kinetics710/README_zh-CN.md @@ -23,10 +23,10 @@ Kinetics-710 是基于 Kinetics-400/600/700 的视频数据集,它合并了这些 Kinetics 数据集的训练集,并根据 Youtube ID 删除了重复的视频。MMAction2 提供了一个基于 Kinetics-400/600/700 的 OpenDataLab 版本的标注文件,你可以通过 [MIM](https://github.com/open-mmlab/mim) 从 OpenDataLab 下载。 ```shell -# 安装 OpenDataLab CLI 工具 -pip install -U opendatalab -# 登录 OpenDataLab -odl login +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login # 下载 Kinetics-400/600/700,注意这可能需要很长时间。 mim download mmaction2 --dataset kinetics400 mim download mmaction2 --dataset kinetics600 diff --git a/tools/data/sthv2/README.md b/tools/data/sthv2/README.md index 5e05e6ff90..5c6ee953b4 100644 --- a/tools/data/sthv2/README.md +++ b/tools/data/sthv2/README.md @@ -22,10 +22,10 @@ For basic dataset information, you can refer to the dataset [website](https://de ````{group-tab} Download by MIM MIM supports downloading from OpenDataLab and preprocessing Something-Something V2 dataset with one command line. ```Bash -# install OpenDataLab CLI tools -pip install -U opendatalab -# log in OpenDataLab -odl login +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login # download and preprocess by MIM mim download mmaction2 --dataset sthv2 ``` diff --git a/tools/data/sthv2/README_zh-CN.md b/tools/data/sthv2/README_zh-CN.md index 0bc5baf3fc..83647eb735 100644 --- a/tools/data/sthv2/README_zh-CN.md +++ b/tools/data/sthv2/README_zh-CN.md @@ -22,10 +22,10 @@ ````{group-tab} 使用 MIM 下载 # MIM 支持下载 Something-Something V2 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。 ```Bash -# 安装 OpenDataLab CLI 工具 -pip install -U opendatalab -# 登录 OpenDataLab -odl login +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login # 通过 MIM 进行数据集下载,预处理。注意这将花费较长时间 mim download mmaction2 --dataset sthv2 ``` diff --git a/tools/data/sthv2/preprocss.sh b/tools/data/sthv2/preprocss.sh index 440a3d42ba..39b574d317 100644 --- a/tools/data/sthv2/preprocss.sh +++ b/tools/data/sthv2/preprocss.sh @@ -3,6 +3,6 @@ DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/sthv2/raw/*.tar.gz | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenDataLab___sthv2/raw/*.tar.gz | tar -xvz -C $(dirname $DATA_ROOT) tar -xvf $DATA_ROOT/sthv2.tar -C $(dirname $DATA_ROOT) rm $DATA_ROOT/sthv2.tar
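The `resnet_tsm.py`/`resnet_tin.py` hunks earlier in this patch replace the hard-coded `['.net', '.block']` prefix list in `load_original_weights` with a `_get_wrap_prefix()` hook, which `ResNetTIN` overrides to return `['.net2']` (its empty `init_weights` override is dropped at the same time, so it falls back to the parent's loading path). A minimal standalone sketch of the pattern, with simplified class names rather than the actual mmaction modules:

```python
# Sketch of the _get_wrap_prefix() hook: the base class strips its wrapper
# prefixes from module names so they match the original checkpoint keys, and
# a subclass only overrides the prefix list, not the remapping logic.
class TSMLikeBackbone:
    def _get_wrap_prefix(self):
        return ['.net', '.block']

    def to_original_name(self, name: str) -> str:
        for wrap_prefix in self._get_wrap_prefix():
            if wrap_prefix in name:
                name = name.replace(wrap_prefix, '')
        return name


class TINLikeBackbone(TSMLikeBackbone):
    def _get_wrap_prefix(self):
        # TIN wraps the original convs in a different attribute than TSM.
        return ['.net2']


assert TSMLikeBackbone().to_original_name('layer1.0.conv1.net') == 'layer1.0.conv1'
assert TINLikeBackbone().to_original_name('layer1.0.conv1.net2') == 'layer1.0.conv1'
```

With the hook in place, the checkpoint-key remapping in `load_original_weights` is shared, and a subclass only has to state which wrapper suffixes should be stripped from its parameter names.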